diff --git a/nptdms/export/pandas_export.py b/nptdms/export/pandas_export.py
index f67aa19..1d15d4f 100644
--- a/nptdms/export/pandas_export.py
+++ b/nptdms/export/pandas_export.py
@@ -2,7 +2,8 @@
 import numpy as np
 
 
-def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
+def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True,
+                   arrow_dtypes=False):
     """
     Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
 
@@ -12,6 +13,7 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The full TDMS file data.
     :rtype: pandas.DataFrame
     """
@@ -20,10 +22,13 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data
     for group in tdms_file.groups():
         for channel in group.channels():
             channels_to_export[channel.path] = channel
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export,
+        time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+        arrow_dtypes=arrow_dtypes)
 
 
-def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
+def from_group(group, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
     """
     Converts a TDMS group object to a DataFrame. DataFrame columns are named using the channel names.
 
@@ -33,15 +38,19 @@ def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The TDMS object data.
     :rtype: pandas.DataFrame
     """
 
     channels_to_export = OrderedDict((ch.name, ch) for ch in group.channels())
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export, time_index=time_index, absolute_time=absolute_time,
+        scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)
 
 
-def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True):
+def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True,
+                 arrow_dtypes=False):
     """
     Converts the TDMS channel to a DataFrame
 
@@ -51,32 +60,54 @@ def from_channel(channel, time_index=False, absolute_time=False, scaled_data=Tru
         values are absolute times or relative to the start time.
     :param scaled_data: By default the scaled data will be used.
         Set to False to use raw unscaled data.
+    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
     :return: The TDMS object data.
     :rtype: pandas.DataFrame
     """
 
     channels_to_export = {channel.path: channel}
-    return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
+    return _channels_to_dataframe(
+        channels_to_export, time_index=time_index, absolute_time=absolute_time,
+        scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)
 
 
-def _channels_to_dataframe(channels_to_export, time_index=False, absolute_time=False, scaled_data=True):
+def _channels_to_dataframe(
+        channels_to_export, time_index=False, absolute_time=False, scaled_data=True,
+        arrow_dtypes=False):
     import pandas as pd
 
-    dataframe_dict = OrderedDict()
+    column_data = []
     for column_name, channel in channels_to_export.items():
         index = channel.time_track(absolute_time) if time_index else None
         if scaled_data:
-            dataframe_dict[column_name] = pd.Series(data=_array_for_pd(channel[:]), index=index)
+            column_data.append((column_name, _array_for_pd(channel[:]), index))
         elif channel.scaler_data_types:
             # Channel has DAQmx raw data
             raw_data = channel.read_data(scaled=False)
             for scale_id, scaler_data in raw_data.items():
                 scaler_column_name = column_name + "[{0:d}]".format(scale_id)
-                dataframe_dict[scaler_column_name] = pd.Series(data=scaler_data, index=index)
+                column_data.append((scaler_column_name, scaler_data, index))
         else:
             # Raw data for normal TDMS file
             raw_data = channel.read_data(scaled=False)
-            dataframe_dict[column_name] = pd.Series(data=_array_for_pd(raw_data), index=index)
+            column_data.append((column_name, _array_for_pd(raw_data), index))
+
+    dataframe_dict = OrderedDict()
+    if arrow_dtypes:
+        import pyarrow as pa
+
+        for column_name, data, index in column_data:
+            # Let arrow deduce data types from the numpy dtypes
+            if index is not None:
+                index_array = pa.array(index)
+                index = pd.Index(index_array, dtype=pd.ArrowDtype(index_array.type))
+            data_array = pa.array(data)
+            dataframe_dict[column_name] = pd.Series(
+                data=data_array, dtype=pd.ArrowDtype(data_array.type), index=index)
+    else:
+        for column_name, data, index in column_data:
+            dataframe_dict[column_name] = pd.Series(data=data, index=index)
+
     return pd.DataFrame.from_dict(dataframe_dict)
 
 
diff --git a/nptdms/tdms.py b/nptdms/tdms.py
index 72a8d27..7694f44 100644
--- a/nptdms/tdms.py
+++ b/nptdms/tdms.py
@@ -160,7 +160,7 @@ def properties(self):
 
         return self._properties
 
-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
 
@@ -170,11 +170,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
             For DAQmx data, there will be one column per DAQmx raw scaler
            and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The full TDMS file data.
         :rtype: pandas.DataFrame
         """
-        return pandas_export.from_tdms_file(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_tdms_file(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)
 
     def as_hdf(self, filepath, mode='w', group='/'):
         """
@@ -388,7 +391,7 @@ def channels(self):
         """
         return list(self._channels.values())
 
-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS group to a DataFrame. DataFrame columns are named using the channel names.
 
@@ -398,11 +401,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
             For DAQmx data, there will be one column per DAQmx raw scaler
            and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The TDMS object data.
         :rtype: pandas.DataFrame
         """
-        return pandas_export.from_group(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_group(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)
 
     def __len__(self):
         """ Returns the number of channels in this group
@@ -692,7 +698,7 @@ def time_track(self, absolute_time=False, accuracy='ns'):
         return (start_time +
                 (relative_time * unit_correction).astype(time_type))
 
-    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
+    def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
         """
         Converts the TDMS channel to a DataFrame. The DataFrame column is named using the channel path.
 
@@ -702,11 +708,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
         :param scaled_data: By default the scaled data will be used.
             Set to False to use raw unscaled data.
            For DAQmx data, there will be one column per DAQmx raw scaler
            and column names will include the scale id.
+        :param arrow_dtypes: Use PyArrow data types in the DataFrame.
         :return: The TDMS object data.
         :rtype: pandas.DataFrame
         """
-        return pandas_export.from_channel(self, time_index, absolute_time, scaled_data)
+        return pandas_export.from_channel(
+            self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
+            arrow_dtypes=arrow_dtypes)
 
     def _read_data_values(self):
         for chunk in self.data_chunks():
diff --git a/nptdms/test/test_pandas.py b/nptdms/test/test_pandas.py
index cfa1d26..22c3f21 100644
--- a/nptdms/test/test_pandas.py
+++ b/nptdms/test/test_pandas.py
@@ -189,7 +189,7 @@ def test_file_as_dataframe_with_absolute_time():
     df = tdms_data.as_dataframe(time_index=True, absolute_time=True)
 
     expected_start = datetime(2015, 9, 8, 10, 5, 49)
-    assert (df.index == expected_start)[0]
+    assert (df.index[0] == expected_start)
 
 
 @pytest.mark.parametrize('lazy_load', [True, False])
@@ -321,6 +321,37 @@ def test_raw_daqmx_channel_export(lazy_load):
     np.testing.assert_equal(dataframe["/'Group'/'Channel1'[1]"], expected_data[1])
 
 
+@pytest.mark.parametrize('abs_time_index', [False, True])
+def test_dataframe_with_arrow_types(abs_time_index):
+    test_file = GeneratedFile()
+    test_file.add_segment(*timed_segment())
+
+    tdms_data = test_file.load()
+
+    file_df = tdms_data.as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    group_df = tdms_data['Group'].as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    channel_df = tdms_data['Group']['Channel1'].as_dataframe(
+        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
+
+    assert len(file_df) == 2
+    assert "/'Group'/'Channel1'" in file_df.keys()
+    assert "/'Group'/'Channel2'" in file_df.keys()
+
+    def check_series(series):
+        assert (series == [1, 2]).all()
+        assert series.dtype == "int32[pyarrow]"
+        if abs_time_index:
+            assert series.index.dtype == "timestamp[ns][pyarrow]"
+
+    check_series(file_df["/'Group'/'Channel1'"])
+    check_series(group_df['Channel1'])
+    check_series(channel_df["/'Group'/'Channel1'"])
+
+
 def test_export_with_empty_channels():
     """Convert a group to dataframe when a channel has empty data and void data type"""
 
diff --git a/setup.cfg b/setup.cfg
index 829038c..2e3643e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -42,6 +42,7 @@ test =
     scipy
 pandas =
     pandas
+    pyarrow
 hdf =
     h5py >= 2.10.0
 thermocouple_scaling =
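
For illustration, a minimal sketch of how the new arrow_dtypes keyword could be used from calling code with this change applied; the file name "example.tdms" is a placeholder, and pandas plus pyarrow are assumed to be installed (for example via the pandas extra updated above):

    from nptdms import TdmsFile

    # Read a TDMS file and export it with PyArrow-backed columns
    # ("example.tdms" is a hypothetical input file)
    tdms_file = TdmsFile.read("example.tdms")
    df = tdms_file.as_dataframe(arrow_dtypes=True)

    # Column dtypes are reported as pyarrow-backed types such as "int32[pyarrow]"
    # instead of the usual numpy dtypes
    print(df.dtypes)

The same keyword applies to the group-level and channel-level as_dataframe methods changed in this patch.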