Skip to content

Commit

Permalink
Support using Arrow dtypes when converting to Pandas
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreeve committed Nov 5, 2023
1 parent 58b9808 commit c0744c4
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 18 deletions.
53 changes: 42 additions & 11 deletions nptdms/export/pandas_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import numpy as np


def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True,
arrow_dtypes=False):
"""
Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
Expand All @@ -12,6 +13,7 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data
values are absolute times or relative to the start time.
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The full TDMS file data.
:rtype: pandas.DataFrame
"""
Expand All @@ -20,10 +22,13 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data
for group in tdms_file.groups():
for channel in group.channels():
channels_to_export[channel.path] = channel
return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
return _channels_to_dataframe(
channels_to_export,
time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
arrow_dtypes=arrow_dtypes)


def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
def from_group(group, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
"""
Converts a TDMS group object to a DataFrame. DataFrame columns are named using the channel names.
Expand All @@ -33,15 +38,19 @@ def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
values are absolute times or relative to the start time.
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The TDMS object data.
:rtype: pandas.DataFrame
"""

channels_to_export = OrderedDict((ch.name, ch) for ch in group.channels())
return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
return _channels_to_dataframe(
channels_to_export, time_index=time_index, absolute_time=absolute_time,
scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)


def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True):
def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True,
arrow_dtypes=False):
"""
Converts the TDMS channel to a DataFrame
Expand All @@ -51,32 +60,54 @@ def from_channel(channel, time_index=False, absolute_time=False, scaled_data=Tru
values are absolute times or relative to the start time.
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The TDMS object data.
:rtype: pandas.DataFrame
"""

channels_to_export = {channel.path: channel}
return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
return _channels_to_dataframe(
channels_to_export, time_index=time_index, absolute_time=absolute_time,
scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)


def _channels_to_dataframe(
        channels_to_export, time_index=False, absolute_time=False, scaled_data=True,
        arrow_dtypes=False):
    """ Build a DataFrame with one column per exported channel.

    :param channels_to_export: Mapping from column name to TDMS channel object.
    :param time_index: Whether to include a time index for the dataframe.
    :param absolute_time: If using a time_index, whether the time values
        are absolute times or relative to the start time.
    :param scaled_data: By default the scaled data will be used.
        Set to False to use raw unscaled data.
        For DAQmx raw data there is one column per raw scaler and column
        names include the scale id.
    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
    :rtype: pandas.DataFrame
    """
    import pandas as pd

    # Collect (column name, data array, index) tuples first so that the
    # optional conversion to Arrow-backed columns happens in one place.
    column_data = []
    for column_name, channel in channels_to_export.items():
        index = channel.time_track(absolute_time) if time_index else None
        if scaled_data:
            column_data.append((column_name, _array_for_pd(channel[:]), index))
        elif channel.scaler_data_types:
            # Channel has DAQmx raw data: one column per raw scaler
            raw_data = channel.read_data(scaled=False)
            for scale_id, scaler_data in raw_data.items():
                scaler_column_name = column_name + "[{0:d}]".format(scale_id)
                column_data.append((scaler_column_name, scaler_data, index))
        else:
            # Raw data for normal TDMS file
            raw_data = channel.read_data(scaled=False)
            column_data.append((column_name, _array_for_pd(raw_data), index))

    dataframe_dict = OrderedDict()
    if arrow_dtypes:
        # pyarrow is an optional dependency, so only import it when needed
        import pyarrow as pa

        for column_name, data, index in column_data:
            # Let arrow deduce data types from the numpy dtypes
            if index is not None:
                index_array = pa.array(index)
                index = pd.Index(index_array, dtype=pd.ArrowDtype(index_array.type))
            data_array = pa.array(data)
            dataframe_dict[column_name] = pd.Series(
                data=data_array, dtype=pd.ArrowDtype(data_array.type), index=index)
    else:
        for column_name, data, index in column_data:
            dataframe_dict[column_name] = pd.Series(data=data, index=index)

    return pd.DataFrame.from_dict(dataframe_dict)


Expand Down
21 changes: 15 additions & 6 deletions nptdms/tdms.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def properties(self):

return self._properties

def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
"""
Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
Expand All @@ -170,11 +170,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The full TDMS file data.
:rtype: pandas.DataFrame
"""

return pandas_export.from_tdms_file(self, time_index, absolute_time, scaled_data)
return pandas_export.from_tdms_file(
self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
arrow_dtypes=arrow_dtypes)

def as_hdf(self, filepath, mode='w', group='/'):
"""
Expand Down Expand Up @@ -388,7 +391,7 @@ def channels(self):
"""
return list(self._channels.values())

def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
"""
Converts the TDMS group to a DataFrame. DataFrame columns are named using the channel names.
Expand All @@ -398,11 +401,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The TDMS object data.
:rtype: pandas.DataFrame
"""

return pandas_export.from_group(self, time_index, absolute_time, scaled_data)
return pandas_export.from_group(
self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
arrow_dtypes=arrow_dtypes)

def __len__(self):
""" Returns the number of channels in this group
Expand Down Expand Up @@ -692,7 +698,7 @@ def time_track(self, absolute_time=False, accuracy='ns'):
return (start_time +
(relative_time * unit_correction).astype(time_type))

def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
"""
Converts the TDMS channel to a DataFrame. The DataFrame column is named using the channel path.
Expand All @@ -702,11 +708,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The TDMS object data.
:rtype: pandas.DataFrame
"""

return pandas_export.from_channel(self, time_index, absolute_time, scaled_data)
return pandas_export.from_channel(
self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
arrow_dtypes=arrow_dtypes)

def _read_data_values(self):
for chunk in self.data_chunks():
Expand Down
33 changes: 32 additions & 1 deletion nptdms/test/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def test_file_as_dataframe_with_absolute_time():
df = tdms_data.as_dataframe(time_index=True, absolute_time=True)

expected_start = datetime(2015, 9, 8, 10, 5, 49)
assert (df.index == expected_start)[0]
assert (df.index[0] == expected_start)


@pytest.mark.parametrize('lazy_load', [True, False])
Expand Down Expand Up @@ -321,6 +321,37 @@ def test_raw_daqmx_channel_export(lazy_load):
np.testing.assert_equal(dataframe["/'Group'/'Channel1'[1]"], expected_data[1])


@pytest.mark.parametrize('abs_time_index', [False, True])
def test_dataframe_with_arrow_types(abs_time_index):
    """ Exporting with arrow_dtypes=True yields PyArrow-backed columns
    (and a PyArrow-backed time index when requested) at the file,
    group and channel level.
    """
    test_file = GeneratedFile()
    test_file.add_segment(*timed_segment())
    tdms_data = test_file.load()

    export_kwargs = dict(
        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
    file_df = tdms_data.as_dataframe(**export_kwargs)
    group_df = tdms_data['Group'].as_dataframe(**export_kwargs)
    channel_df = tdms_data['Group']['Channel1'].as_dataframe(**export_kwargs)

    # File-level export contains both channels of the generated file
    assert len(file_df) == 2
    assert "/'Group'/'Channel1'" in file_df.keys()
    assert "/'Group'/'Channel2'" in file_df.keys()

    def check_series(series):
        # Values survive the round trip and the dtype is Arrow-backed
        assert (series == [1, 2]).all()
        assert series.dtype == "int32[pyarrow]"
        if abs_time_index:
            assert series.index.dtype == "timestamp[ns][pyarrow]"

    for series in (
            file_df["/'Group'/'Channel1'"],
            group_df['Channel1'],
            channel_df["/'Group'/'Channel1'"]):
        check_series(series)


def test_export_with_empty_channels():
"""Convert a group to dataframe when a channel has empty data and void data type"""

Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ test =
scipy
pandas =
pandas
pyarrow
hdf =
h5py >= 2.10.0
thermocouple_scaling =
Expand Down

0 comments on commit c0744c4

Please sign in to comment.