Skip to content

Commit

Permalink
Support using Arrow dtypes when converting to Pandas
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreeve committed Nov 5, 2023
1 parent 58b9808 commit c0744c4
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 18 deletions.
53 changes: 42 additions & 11 deletions nptdms/export/pandas_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import numpy as np


def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True):
def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data=True,
arrow_dtypes=False):
"""
Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
Expand All @@ -12,6 +13,7 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data
values are absolute times or relative to the start time.
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The full TDMS file data.
:rtype: pandas.DataFrame
"""
Expand All @@ -20,10 +22,13 @@ def from_tdms_file(tdms_file, time_index=False, absolute_time=False, scaled_data
for group in tdms_file.groups():
for channel in group.channels():
channels_to_export[channel.path] = channel
return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
return _channels_to_dataframe(
channels_to_export,
time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
arrow_dtypes=arrow_dtypes)


def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
def from_group(group, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
"""
Converts a TDMS group object to a DataFrame. DataFrame columns are named using the channel names.
Expand All @@ -33,15 +38,19 @@ def from_group(group, time_index=False, absolute_time=False, scaled_data=True):
values are absolute times or relative to the start time.
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The TDMS object data.
:rtype: pandas.DataFrame
"""

channels_to_export = OrderedDict((ch.name, ch) for ch in group.channels())
return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
return _channels_to_dataframe(
channels_to_export, time_index=time_index, absolute_time=absolute_time,
scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)


def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True):
def from_channel(channel, time_index=False, absolute_time=False, scaled_data=True,
arrow_dtypes=False):
"""
Converts the TDMS channel to a DataFrame
Expand All @@ -51,32 +60,54 @@ def from_channel(channel, time_index=False, absolute_time=False, scaled_data=Tru
values are absolute times or relative to the start time.
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The TDMS object data.
:rtype: pandas.DataFrame
"""

channels_to_export = {channel.path: channel}
return _channels_to_dataframe(channels_to_export, time_index, absolute_time, scaled_data)
return _channels_to_dataframe(
channels_to_export, time_index=time_index, absolute_time=absolute_time,
scaled_data=scaled_data, arrow_dtypes=arrow_dtypes)


def _channels_to_dataframe(
        channels_to_export, time_index=False, absolute_time=False, scaled_data=True,
        arrow_dtypes=False):
    """ Build a DataFrame with one column per exported channel.

    :param channels_to_export: Mapping from column name to TDMS channel object.
    :param time_index: Whether to include a time index for the dataframe.
    :param absolute_time: If using a time_index, whether the time values
        are absolute times or relative to the start time.
    :param scaled_data: By default the scaled data will be used.
        Set to False to use raw unscaled data.
        For DAQmx raw data there is one column per raw scaler and column
        names include the scale id.
    :param arrow_dtypes: Use PyArrow data types in the DataFrame.
    :rtype: pandas.DataFrame
    """
    import pandas as pd

    # Collect (column name, data array, index) tuples first so that the
    # optional conversion to Arrow-backed columns happens in one place.
    column_data = []
    for column_name, channel in channels_to_export.items():
        index = channel.time_track(absolute_time) if time_index else None
        if scaled_data:
            column_data.append((column_name, _array_for_pd(channel[:]), index))
        elif channel.scaler_data_types:
            # Channel has DAQmx raw data: one column per raw scaler
            raw_data = channel.read_data(scaled=False)
            for scale_id, scaler_data in raw_data.items():
                scaler_column_name = column_name + "[{0:d}]".format(scale_id)
                column_data.append((scaler_column_name, scaler_data, index))
        else:
            # Raw data for normal TDMS file
            raw_data = channel.read_data(scaled=False)
            column_data.append((column_name, _array_for_pd(raw_data), index))

    dataframe_dict = OrderedDict()
    if arrow_dtypes:
        # pyarrow is an optional dependency, so only import it when needed
        import pyarrow as pa

        for column_name, data, index in column_data:
            # Let arrow deduce data types from the numpy dtypes
            if index is not None:
                index_array = pa.array(index)
                index = pd.Index(index_array, dtype=pd.ArrowDtype(index_array.type))
            data_array = pa.array(data)
            dataframe_dict[column_name] = pd.Series(
                data=data_array, dtype=pd.ArrowDtype(data_array.type), index=index)
    else:
        for column_name, data, index in column_data:
            dataframe_dict[column_name] = pd.Series(data=data, index=index)

    return pd.DataFrame.from_dict(dataframe_dict)


Expand Down
21 changes: 15 additions & 6 deletions nptdms/tdms.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def properties(self):

return self._properties

def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
"""
Converts the TDMS file to a DataFrame. DataFrame columns are named using the TDMS object paths.
Expand All @@ -170,11 +170,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The full TDMS file data.
:rtype: pandas.DataFrame
"""

return pandas_export.from_tdms_file(self, time_index, absolute_time, scaled_data)
return pandas_export.from_tdms_file(
self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
arrow_dtypes=arrow_dtypes)

def as_hdf(self, filepath, mode='w', group='/'):
"""
Expand Down Expand Up @@ -388,7 +391,7 @@ def channels(self):
"""
return list(self._channels.values())

def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
"""
Converts the TDMS group to a DataFrame. DataFrame columns are named using the channel names.
Expand All @@ -398,11 +401,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The TDMS object data.
:rtype: pandas.DataFrame
"""

return pandas_export.from_group(self, time_index, absolute_time, scaled_data)
return pandas_export.from_group(
self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
arrow_dtypes=arrow_dtypes)

def __len__(self):
""" Returns the number of channels in this group
Expand Down Expand Up @@ -692,7 +698,7 @@ def time_track(self, absolute_time=False, accuracy='ns'):
return (start_time +
(relative_time * unit_correction).astype(time_type))

def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True, arrow_dtypes=False):
"""
Converts the TDMS channel to a DataFrame. The DataFrame column is named using the channel path.
Expand All @@ -702,11 +708,14 @@ def as_dataframe(self, time_index=False, absolute_time=False, scaled_data=True):
:param scaled_data: By default the scaled data will be used.
Set to False to use raw unscaled data.
For DAQmx data, there will be one column per DAQmx raw scaler and column names will include the scale id.
:param arrow_dtypes: Use PyArrow data types in the DataFrame.
:return: The TDMS object data.
:rtype: pandas.DataFrame
"""

return pandas_export.from_channel(self, time_index, absolute_time, scaled_data)
return pandas_export.from_channel(
self, time_index=time_index, absolute_time=absolute_time, scaled_data=scaled_data,
arrow_dtypes=arrow_dtypes)

def _read_data_values(self):
for chunk in self.data_chunks():
Expand Down
33 changes: 32 additions & 1 deletion nptdms/test/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def test_file_as_dataframe_with_absolute_time():
df = tdms_data.as_dataframe(time_index=True, absolute_time=True)

expected_start = datetime(2015, 9, 8, 10, 5, 49)
assert (df.index == expected_start)[0]
assert (df.index[0] == expected_start)


@pytest.mark.parametrize('lazy_load', [True, False])
Expand Down Expand Up @@ -321,6 +321,37 @@ def test_raw_daqmx_channel_export(lazy_load):
np.testing.assert_equal(dataframe["/'Group'/'Channel1'[1]"], expected_data[1])


@pytest.mark.parametrize('abs_time_index', [False, True])
def test_dataframe_with_arrow_types(abs_time_index):
    """ Exporting with arrow_dtypes=True yields PyArrow-backed columns
    (and a PyArrow-backed time index when requested) at the file,
    group and channel level.
    """
    test_file = GeneratedFile()
    test_file.add_segment(*timed_segment())
    tdms_data = test_file.load()

    export_kwargs = dict(
        arrow_dtypes=True, time_index=abs_time_index, absolute_time=abs_time_index)
    file_df = tdms_data.as_dataframe(**export_kwargs)
    group_df = tdms_data['Group'].as_dataframe(**export_kwargs)
    channel_df = tdms_data['Group']['Channel1'].as_dataframe(**export_kwargs)

    # File-level export contains both channels of the generated file
    assert len(file_df) == 2
    assert "/'Group'/'Channel1'" in file_df.keys()
    assert "/'Group'/'Channel2'" in file_df.keys()

    def check_series(series):
        # Values survive the round trip and the dtype is Arrow-backed
        assert (series == [1, 2]).all()
        assert series.dtype == "int32[pyarrow]"
        if abs_time_index:
            assert series.index.dtype == "timestamp[ns][pyarrow]"

    for series in (
            file_df["/'Group'/'Channel1'"],
            group_df['Channel1'],
            channel_df["/'Group'/'Channel1'"]):
        check_series(series)


def test_export_with_empty_channels():
"""Convert a group to dataframe when a channel has empty data and void data type"""

Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ test =
scipy
pandas =
pandas
pyarrow
hdf =
h5py >= 2.10.0
thermocouple_scaling =
Expand Down

0 comments on commit c0744c4

Please sign in to comment.