Skip to content

Commit

Permalink
Don't duplicate data when encoding audio or image (#4187)
Browse files · Browse the repository at this point in the history
* don't duplicate data in audio

* don't duplicate data in image

* one more comment
Loading branch information
lhoestq authored Apr 21, 2022
1 parent 966d3bc commit b564af7
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
10 changes: 8 additions & 2 deletions src/datasets/features/audio.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from dataclasses import dataclass, field
from io import BytesIO
from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Union
Expand Down Expand Up @@ -70,11 +71,16 @@ def encode_example(self, value: Union[str, dict]) -> dict:
raise ImportError("To support encoding audio data, please install 'soundfile'.") from err
if isinstance(value, str):
return {"bytes": None, "path": value}
elif isinstance(value, dict) and "array" in value:
elif "array" in value:
# convert the audio array to wav bytes
buffer = BytesIO()
sf.write(buffer, value["array"], value["sampling_rate"], format="wav")
return {"bytes": buffer.getvalue(), "path": value.get("path")}
return {"bytes": buffer.getvalue(), "path": None}
elif value.get("path") is not None and os.path.isfile(value["path"]):
# we set "bytes": None to not duplicate the data if they're already available locally
return {"bytes": None, "path": value.get("path")}
elif value.get("bytes") is not None or value.get("path") is not None:
# store the audio bytes, and path is used to infer the audio format using the file extension
return {"bytes": value.get("bytes"), "path": value.get("path")}
else:
raise ValueError(
Expand Down
7 changes: 7 additions & 0 deletions src/datasets/features/image.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from dataclasses import dataclass, field
from io import BytesIO
from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Union
Expand Down Expand Up @@ -69,11 +70,17 @@ def encode_example(self, value: Union[str, dict, np.ndarray, "PIL.Image.Image"])
if isinstance(value, str):
return {"path": value, "bytes": None}
elif isinstance(value, np.ndarray):
# convert the image array to png bytes
image = PIL.Image.fromarray(value.astype(np.uint8))
return {"path": None, "bytes": image_to_bytes(image)}
elif isinstance(value, PIL.Image.Image):
# convert the PIL image to bytes (default format is png)
return encode_pil_image(value)
elif value.get("path") is not None and os.path.isfile(value["path"]):
# we set "bytes": None to not duplicate the data if they're already available locally
return {"bytes": None, "path": value.get("path")}
elif value.get("bytes") is not None or value.get("path") is not None:
# store the image bytes, and path is used to infer the image format using the file extension
return {"bytes": value.get("bytes"), "path": value.get("path")}
else:
raise ValueError(
Expand Down

1 comment on commit b564af7

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==5.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.010130 / 0.011353 (-0.001223) 0.003602 / 0.011008 (-0.007407) 0.031916 / 0.038508 (-0.006592) 0.035572 / 0.023109 (0.012463) 0.302505 / 0.275898 (0.026607) 0.327537 / 0.323480 (0.004057) 0.008166 / 0.007986 (0.000180) 0.003844 / 0.004328 (-0.000484) 0.009692 / 0.004250 (0.005442) 0.042314 / 0.037052 (0.005261) 0.283322 / 0.258489 (0.024833) 0.326484 / 0.293841 (0.032643) 0.032724 / 0.128546 (-0.095822) 0.010254 / 0.075646 (-0.065392) 0.261577 / 0.419271 (-0.157695) 0.053288 / 0.043533 (0.009755) 0.299158 / 0.255139 (0.044019) 0.329968 / 0.283200 (0.046769) 0.112729 / 0.141683 (-0.028954) 1.753273 / 1.452155 (0.301118) 1.945962 / 1.492716 (0.453246)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.259842 / 0.018006 (0.241836) 0.437785 / 0.000490 (0.437295) 0.017808 / 0.000200 (0.017608) 0.000307 / 0.000054 (0.000253)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.030900 / 0.037411 (-0.006512) 0.116513 / 0.014526 (0.101987) 0.129904 / 0.176557 (-0.046653) 0.180188 / 0.737135 (-0.556947) 0.142965 / 0.296338 (-0.153374)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.486104 / 0.215209 (0.270895) 4.765523 / 2.077655 (2.687868) 2.104584 / 1.504120 (0.600464) 1.934087 / 1.541195 (0.392893) 1.907134 / 1.468490 (0.438644) 0.508986 / 4.584777 (-4.075791) 5.655378 / 3.745712 (1.909666) 2.430440 / 5.269862 (-2.839421) 1.044440 / 4.565676 (-3.521236) 0.061540 / 0.424275 (-0.362735) 0.014659 / 0.007607 (0.007052) 0.583013 / 0.226044 (0.356968) 5.804445 / 2.268929 (3.535517) 2.514936 / 55.444624 (-52.929688) 2.133450 / 6.876477 (-4.743027) 2.330863 / 2.142072 (0.188791) 0.640690 / 4.805227 (-4.164538) 0.140636 / 6.500664 (-6.360029) 0.072869 / 0.075469 (-0.002600)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.923181 / 1.841788 (0.081393) 15.773462 / 8.074308 (7.699154) 28.736714 / 10.191392 (18.545322) 0.926449 / 0.680424 (0.246025) 0.553902 / 0.534201 (0.019701) 0.533602 / 0.579283 (-0.045681) 0.567572 / 0.434364 (0.133208) 0.358488 / 0.540337 (-0.181850) 0.368271 / 1.386936 (-1.018665)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008739 / 0.011353 (-0.002614) 0.004064 / 0.011008 (-0.006944) 0.031313 / 0.038508 (-0.007195) 0.038080 / 0.023109 (0.014971) 0.321314 / 0.275898 (0.045416) 0.365136 / 0.323480 (0.041656) 0.006567 / 0.007986 (-0.001419) 0.003717 / 0.004328 (-0.000611) 0.007522 / 0.004250 (0.003271) 0.040163 / 0.037052 (0.003111) 0.303159 / 0.258489 (0.044670) 0.350670 / 0.293841 (0.056829) 0.047843 / 0.128546 (-0.080703) 0.008343 / 0.075646 (-0.067304) 0.292614 / 0.419271 (-0.126658) 0.057123 / 0.043533 (0.013590) 0.310026 / 0.255139 (0.054887) 0.355121 / 0.283200 (0.071921) 0.100584 / 0.141683 (-0.041099) 1.995645 / 1.452155 (0.543490) 2.139152 / 1.492716 (0.646436)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.239970 / 0.018006 (0.221964) 0.424761 / 0.000490 (0.424271) 0.001311 / 0.000200 (0.001111) 0.000131 / 0.000054 (0.000076)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.028948 / 0.037411 (-0.008463) 0.123175 / 0.014526 (0.108649) 0.125903 / 0.176557 (-0.050654) 0.179715 / 0.737135 (-0.557420) 0.132756 / 0.296338 (-0.163583)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.489781 / 0.215209 (0.274572) 4.833586 / 2.077655 (2.755932) 2.090041 / 1.504120 (0.585921) 1.945195 / 1.541195 (0.404000) 1.892969 / 1.468490 (0.424479) 0.516438 / 4.584777 (-4.068339) 5.147794 / 3.745712 (1.402082) 3.633920 / 5.269862 (-1.635941) 1.053361 / 4.565676 (-3.512315) 0.053369 / 0.424275 (-0.370906) 0.013490 / 0.007607 (0.005883) 0.535147 / 0.226044 (0.309103) 5.370121 / 2.268929 (3.101193) 2.240477 / 55.444624 (-53.204147) 1.904586 / 6.876477 (-4.971890) 1.974871 / 2.142072 (-0.167202) 0.539061 / 4.805227 (-4.266166) 0.120560 / 6.500664 (-6.380104) 0.059489 / 0.075469 (-0.015981)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.806475 / 1.841788 (-0.035313) 15.578084 / 8.074308 (7.503776) 28.660742 / 10.191392 (18.469350) 0.883451 / 0.680424 (0.203027) 0.531079 / 0.534201 (-0.003122) 0.486631 / 0.579283 (-0.092653) 0.554950 / 0.434364 (0.120586) 0.349961 / 0.540337 (-0.190376) 0.372883 / 1.386936 (-1.014053)

CML watermark

Please sign in to comment.