[text to audio generator] Replaced bark with OpenAI's TTS models #836

Merged · 2 commits · Dec 31, 2024
88 changes: 36 additions & 52 deletions text_to_audio_generator/function.yaml

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions text_to_audio_generator/item.yaml
@@ -13,7 +13,7 @@ labels:
author: yonatans
maintainers: []
marketplaceType: ''
mlrunVersion: 1.5.1
mlrunVersion: 1.7.1
name: text_to_audio_generator
platformVersion: 3.5.3
spec:
@@ -22,8 +22,9 @@ spec:
image: mlrun/mlrun
kind: job
requirements:
- bark
- openai
- torchaudio
- pydub
url: ''
version: 1.2.0
version: 1.3.0
test_valid: True
5 changes: 3 additions & 2 deletions text_to_audio_generator/requirements.txt
@@ -1,2 +1,3 @@
bark
torchaudio>=2.1.0
openai>=1.58.0
torchaudio>=2.1.0
pydub
18 changes: 11 additions & 7 deletions text_to_audio_generator/test_text_to_audio_generator.py
@@ -12,11 +12,18 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import mlrun
import os
import tempfile

import mlrun
import pytest


@pytest.mark.skipif(
    condition=os.getenv("OPENAI_BASE_URL") is None
    or os.getenv("OPENAI_API_KEY") is None,
    reason="OpenAI API key and base URL are required to run this test",
)
@pytest.mark.parametrize("file_format,bits_per_sample", [("wav", 8), ("mp3", None)])
def test_generate_multi_speakers_audio(file_format, bits_per_sample):
text_to_audio_generator_function = mlrun.import_function("function.yaml")
@@ -28,12 +35,9 @@ def test_generate_multi_speakers_audio(file_format, bits_per_sample):
"output_directory": test_directory,
"speakers": {"Agent": 0, "Client": 1},
"available_voices": [
"v2/en_speaker_0",
"v2/en_speaker_1",
"alloy",
"echo",
],
"use_small_models": True,
"use_gpu": False,
"offload_cpu": True,
"file_format": file_format,
"bits_per_sample": bits_per_sample,
},
@@ -45,6 +49,6 @@ def test_generate_multi_speakers_audio(file_format, bits_per_sample):
],
artifact_path=test_directory,
)
assert function_run.error == "Run state (completed) is not in error state"
assert function_run.error == ""
for key in ["audio_files", "audio_files_dataframe", "text_to_speech_errors"]:
assert key in function_run.outputs and function_run.outputs[key] is not None
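
Note: the test above is skipped unless OpenAI credentials are available. A minimal sketch of supplying them before running pytest (both values are placeholders, not working credentials):

import os

os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder key
os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1"  # placeholder endpoint

The same variables are also read by _get_openai_client in text_to_audio_generator.py, with mlrun project secrets as a fallback.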
12 changes: 3 additions & 9 deletions text_to_audio_generator/text_to_audio_generator.ipynb
@@ -31,10 +31,7 @@
"id": "bb20c4a6-f362-40e6-8f73-9145953959ec",
"metadata": {},
"outputs": [],
"source": [
"import mlrun\n",
"import tempfile"
]
"source": "import mlrun"
},
{
"cell_type": "code",
@@ -322,12 +319,9 @@
" \"output_directory\": \"./out\",\n",
" \"speakers\": {\"Agent\": 0, \"Client\": 1},\n",
" \"available_voices\": [\n",
" \"v2/en_speaker_0\",\n",
" \"v2/en_speaker_1\",\n",
" \"alloy\",\n",
" \"echo\",\n",
" ],\n",
" \"use_small_models\": True,\n",
" \"use_gpu\": False,\n",
" \"offload_cpu\": True,\n",
" \"file_format\": \"mp3\",\n",
" # \"bits_per_sample\": 8,\n",
" },\n",
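For reference, a minimal sketch of the run call as the updated notebook assembles it (the handler name and data_path value are assumptions; only the params shown in the hunk above come from the notebook):

import mlrun

fn = mlrun.import_function("function.yaml")
run = fn.run(
    handler="generate_multi_speakers_audio",  # assumed handler name, matching the function below
    params={
        "data_path": "./data",  # assumption: a directory of conversation text files
        "output_directory": "./out",
        "speakers": {"Agent": 0, "Client": 1},
        "available_voices": ["alloy", "echo"],
        "file_format": "mp3",
    },
    local=True,
)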
91 changes: 65 additions & 26 deletions text_to_audio_generator/text_to_audio_generator.py
@@ -11,35 +11,41 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import os
import pathlib
import random
import tempfile
from typing import Dict, List, Optional, Tuple, Union

import bark
import numpy as np
import openai
import pandas as pd
import torch
import torchaudio
import tqdm
from pydub import AudioSegment

# Get the global logger:
_LOGGER = logging.getLogger()

OPENAI_API_KEY = "OPENAI_API_KEY"
OPENAI_BASE_URL = "OPENAI_BASE_URL"
SAMPLE_RATE = 24000


def generate_multi_speakers_audio(
data_path: str,
speakers: Union[List[str], Dict[str, int]],
available_voices: List[str],
output_directory: str = None,
use_gpu: bool = True,
use_small_models: bool = False,
offload_cpu: bool = False,
model: str = "tts-1",
sample_rate: int = 16000,
file_format: str = "wav",
verbose: bool = True,
bits_per_sample: Optional[int] = None,
speed: float = 1.0,
) -> Tuple[str, pd.DataFrame, dict]:
"""
Generate audio files from text files.
@@ -50,21 +56,20 @@ def generate_multi_speakers_audio(
If dictionary, the keys will be the speakers and the values will be the channels.
:param available_voices: List of available voices to use for the generation.
See here for the available voices:
https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
https://platform.openai.com/docs/guides/text-to-speech#voice-options
:param output_directory: Path to the directory to save the generated audio files to.
:param use_gpu: Whether to use the GPU for the generation.
:param use_small_models: Whether to use the small models for the generation.
:param offload_cpu: To reduce the memory footprint, the models can be offloaded to the CPU after loading.
:param model: Which model to use for the generation.
:param sample_rate: The sampling rate of the generated audio.
:param file_format: The format of the generated audio files.
:param verbose: Whether to print the progress of the generation.
:param bits_per_sample: Changes the bit depth for the supported formats.
Supported only in "wav" or "flac" formats.
:param speed: The speed of the generated audio. Select a value from `0.25` to `4.0`. `1.0` is the default.

:returns: A tuple of:
- The output directory path.
- The generated audio files dataframe.
- The errors dictionary.
"""

global _LOGGER
@@ -73,17 +78,8 @@
data_path = pathlib.Path(data_path).absolute()
text_files = _get_text_files(data_path=data_path)

# Load the bark models according to the given configurations:
bark.preload_models(
text_use_gpu=use_gpu,
text_use_small=use_small_models,
coarse_use_gpu=use_gpu,
coarse_use_small=use_small_models,
fine_use_gpu=use_gpu,
fine_use_small=use_small_models,
codec_use_gpu=use_gpu,
force_reload=offload_cpu,
)
    # Connect to the OpenAI client:
client = _get_openai_client()

# Check for per channel generation:
if isinstance(speakers, dict):
@@ -98,11 +94,11 @@

# Prepare the resampling module:
resampler = torchaudio.transforms.Resample(
orig_freq=bark.SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
orig_freq=SAMPLE_RATE, new_freq=sample_rate, dtype=torch.float32
)

# Prepare the gap between each speaker:
gap_between_speakers = np.zeros(int(0.5 * bark.SAMPLE_RATE))
gap_between_speakers = np.zeros(int(0.5 * SAMPLE_RATE))

# Prepare the successes dataframe and errors dictionary to be returned:
successes = []
@@ -156,11 +152,16 @@
)
for sentence in _split_line(line=sentences):
# Generate words audio:
audio = bark.generate_audio(
sentence,
history_prompt=chosen_voices[current_speaker],
silent=True,
audio = client.audio.speech.create(
model=model,
input=sentence,
voice=chosen_voices[current_speaker],
response_format=file_format,
speed=speed,
)
audio = audio.content
audio = _bytes_to_np_array(audio=audio, file_format=file_format)

if speaker_per_channel:
silence = np.zeros_like(audio)
for speaker in audio_pieces.keys():
@@ -214,6 +215,43 @@
return str(output_directory), successes, errors


def _get_openai_client():
api_key = os.getenv(OPENAI_API_KEY)
base_url = os.getenv(OPENAI_BASE_URL)
# Check if the key is already in the environment variables:
if not api_key or not base_url:
try:
import mlrun

context = mlrun.get_or_create_ctx(name="context")
# Check if the key is in the secrets:
api_key = context.get_secret(OPENAI_API_KEY)
base_url = context.get_secret(OPENAI_BASE_URL)
except ModuleNotFoundError:
raise EnvironmentError(
f"One or more of the OpenAI required environment variables ('{OPENAI_API_KEY}', '{OPENAI_BASE_URL}') are missing."
f"Please set them as environment variables or install mlrun (`pip install mlrun`)"
f"and set them as project secrets using `project.set_secrets`."
)
return openai.OpenAI(api_key=api_key, base_url=base_url)


def _bytes_to_np_array(audio: bytes, file_format: str):
if file_format == "mp3":
audio_segment = AudioSegment.from_mp3(io.BytesIO(audio))

# Convert to raw PCM audio data
samples = audio_segment.get_array_of_samples()

# Convert to numpy array
audio_array = np.array(samples)

# Normalize to float between -1 and 1
return audio_array.astype(np.float32) / np.iinfo(samples.typecode).max
else:
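        # Assumption: treat the response bytes as raw 16-bit little-endian PCM and normalize to [-1.0, 1.0]: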
return np.frombuffer(audio, dtype=np.int16) / 32768.0


def _get_text_files(
data_path: pathlib.Path,
) -> List[pathlib.Path]:
@@ -261,6 +299,7 @@ def _get_logger():
global _LOGGER
try:
import mlrun

# Check if MLRun is available:
context = mlrun.get_or_create_ctx(name="mlrun")
return context.logger
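
For context, the core of this change is the swap from bark.generate_audio to OpenAI's speech endpoint. A minimal standalone sketch of that call, using the model and voice values from the diff above (the input text is a placeholder):

import openai

client = openai.OpenAI()  # reads OPENAI_API_KEY (and OPENAI_BASE_URL, if set) from the environment

response = client.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input="Hello from the text to audio generator.",  # placeholder text
    response_format="mp3",
    speed=1.0,
)
audio_bytes = response.content  # raw mp3 bytes, decoded downstream by _bytes_to_np_array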