-
Notifications
You must be signed in to change notification settings - Fork 142
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
improve performance/ergonomics of reading compound datatypes #592
Changes from all commits
82a6ffc
349eaef
31ab8d2
4e3f6c9
dfe1667
d52ea72
8f14a29
733f4d0
6c8bc40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -465,13 +465,6 @@ end | |
==(a::HDF5ReferenceObj, b::HDF5ReferenceObj) = a.r == b.r | ||
hash(x::HDF5ReferenceObj, h::UInt) = hash(x.r, h) | ||
|
||
# Compound types | ||
struct HDF5Compound{N} | ||
data::NTuple{N,Any} | ||
membername::NTuple{N,String} | ||
membertype::NTuple{N,Type} | ||
end | ||
|
||
# Opaque types | ||
struct HDF5Opaque | ||
data | ||
|
@@ -482,9 +475,24 @@ end | |
struct EmptyArray{T} end | ||
|
||
# Stub types to encode fixed-size arrays for H5T_ARRAY | ||
struct FixedArray{T,D} end | ||
size(::Type{FixedArray{T,D}}) where {T,D} = D | ||
eltype(::Type{FixedArray{T,D}}) where {T,D} = T | ||
struct FixedArray{T,D,L} | ||
data::NTuple{L, T} | ||
end | ||
size(::Type{FixedArray{T,D,L}}) where {T,D,L} = D | ||
size(x::T) where T <: FixedArray = size(T) | ||
eltype(::Type{FixedArray{T,D,L}}) where {T,D,L} = T | ||
eltype(x::T) where T <: FixedArray = eltype(T) | ||
|
||
struct FixedString{N} | ||
data::NTuple{N, Cchar} | ||
end | ||
length(::Type{FixedString{N}}) where N = N | ||
|
||
struct VariableArray{T} | ||
len::Csize_t | ||
p::Ptr{Cvoid} | ||
end | ||
eltype(::Type{VariableArray{T}}) where T = T | ||
|
||
# VLEN objects | ||
struct HDF5Vlen{T} | ||
|
@@ -1459,73 +1467,45 @@ function getindex(parent::Union{HDF5File, HDF5Group, HDF5Dataset}, r::HDF5Refere | |
h5object(obj_id, parent) | ||
end | ||
|
||
# Helper for reading compound types | ||
function read_row(io::IO, membertype, membersize) | ||
row = Any[] | ||
for (dtype, dsize) in zip(membertype, membersize) | ||
if dtype === String | ||
push!(row, unpad(read!(io, Vector{UInt8}(undef,dsize)), H5T_STR_NULLPAD)) | ||
elseif dtype<:HDF5.FixedArray && eltype(dtype)<:HDF5BitsKind | ||
val = read!(io, Vector{eltype(dtype)}(undef,prod(size(dtype)))) | ||
push!(row, reshape(val, size(dtype))) | ||
elseif dtype<:HDF5BitsKind | ||
push!(row, read(io, dtype)) | ||
else | ||
# for other types, just store the raw bytes and let the user | ||
# decide what to do | ||
push!(row, read!(io, Vector{UInt8}(undef,dsize))) | ||
end | ||
end | ||
return (row...,) | ||
end | ||
# convert special types to native julia types | ||
normalize_types(x) = x | ||
normalize_types(x::NamedTuple{T}) where T = NamedTuple{T}(map(normalize_types, values(x))) | ||
normalize_types(x::Cstring) = unsafe_string(x) | ||
normalize_types(x::FixedString) = join(Char.(x.data)) | ||
normalize_types(x::FixedArray) = reshape(collect(x.data), size(x)...) | ||
normalize_types(x::VariableArray) = copy(unsafe_wrap(Array, convert(Ptr{eltype(x)}, x.p), x.len, own=false)) | ||
|
||
# Read compound type | ||
function read(obj::HDF5Dataset, T::Union{Type{Array{HDF5Compound{N}}},Type{HDF5Compound{N}}}) where {N} | ||
t = datatype(obj) | ||
local sz = 0; local n; | ||
local membername; local membertype; | ||
local memberoffset; local memberfiletype; local membersize; | ||
try | ||
memberfiletype = Vector{HDF5Datatype}(undef,N) | ||
membertype = Vector{Type}(undef,N) | ||
membername = Vector{String}(undef,N) | ||
memberoffset = Vector{UInt64}(undef,N) | ||
membersize = Vector{UInt32}(undef,N) | ||
for i = 1:N | ||
filetype = HDF5Datatype(h5t_get_member_type(t.id, i-1)) | ||
memberfiletype[i] = filetype | ||
membertype[i] = hdf5_to_julia_eltype(filetype) | ||
memberoffset[i] = sz | ||
membersize[i] = sizeof(filetype) | ||
sz += sizeof(filetype) | ||
membername[i] = h5t_get_member_name(t.id, i-1) | ||
end | ||
finally | ||
close(t) | ||
end | ||
# Build the "memory type" | ||
memtype_id = h5t_create(H5T_COMPOUND, sz) | ||
for i = 1:N | ||
h5t_insert(memtype_id, membername[i], memberoffset[i], memberfiletype[i].id) # FIXME strings | ||
end | ||
# Read the raw data | ||
buf = Vector{UInt8}(undef,length(obj)*sz) | ||
h5d_read(obj.id, memtype_id, H5S_ALL, H5S_ALL, obj.xfer, buf) | ||
|
||
# Convert to the appropriate data format using iobuffer | ||
iobuff = IOBuffer(buf) | ||
data = Any[] | ||
while !eof(iobuff) | ||
push!(data, read_row(iobuff, membertype, membersize)) | ||
end | ||
# convert HDF5Compound type parameters to tuples | ||
membername = (membername...,) | ||
membertype = (membertype...,) | ||
if T === HDF5Compound{N} | ||
return HDF5Compound(data[1], membername, membertype) | ||
else | ||
return [HDF5Compound(elem, membername, membertype) for elem in data] | ||
end | ||
do_normalize(::Type{T}) where T = false | ||
do_normalize(::Type{NamedTuple{T, U}}) where T where U = any(i -> do_normalize(fieldtype(U,i)), 1:fieldcount(U)) | ||
do_normalize(::Type{T}) where T <: Union{Cstring, FixedString, FixedArray, VariableArray} = true | ||
|
||
do_reclaim(::Type{T}) where T = false | ||
do_reclaim(::Type{NamedTuple{T, U}}) where T where U = any(i -> do_reclaim(fieldtype(U,i)), 1:fieldcount(U)) | ||
do_reclaim(::Type{T}) where T <: Union{Cstring, VariableArray} = true | ||
|
||
function read(dset::HDF5Dataset, T::Union{Type{Array{U}}, Type{U}}) where U <: NamedTuple | ||
filetype = HDF5.datatype(dset) | ||
memtype_id = HDF5.h5t_get_native_type(filetype.id) # padded layout in memory | ||
@assert sizeof(U) == HDF5.h5t_get_size(memtype_id) "Type sizes mismatch!" | ||
|
||
buf = Array{U}(undef, size(dset)) | ||
|
||
HDF5.h5d_read(dset.id, memtype_id, HDF5.H5S_ALL, HDF5.H5S_ALL, HDF5.H5P_DEFAULT, buf) | ||
out = do_normalize(U) ? normalize_types.(buf) : buf | ||
|
||
if do_reclaim(U) | ||
dspace = dataspace(dset) | ||
# NOTE I have seen this call fail but I cannot reproduce | ||
h5d_vlen_reclaim(memtype_id, dspace.id, H5P_DEFAULT, buf) | ||
end | ||
|
||
HDF5.h5t_close(memtype_id) | ||
|
||
if T <: NamedTuple | ||
return out[1] | ||
else | ||
return out | ||
end | ||
end | ||
|
||
# Read OPAQUE datasets and attributes | ||
|
@@ -2006,19 +1986,42 @@ function hdf5_to_julia_eltype(objtype) | |
super_id = h5t_get_super(objtype.id) | ||
T = HDF5Vlen{hdf5_to_julia_eltype(HDF5Datatype(super_id))} | ||
elseif class_id == H5T_COMPOUND | ||
N = Int(h5t_get_nmembers(objtype.id)) | ||
# check if should be interpreted as complex | ||
if COMPLEX_SUPPORT[] && N == 2 | ||
membernames = ntuple(N) do i | ||
h5t_get_member_name(objtype.id, i-1) | ||
end | ||
membertypes = ntuple(N) do i | ||
hdf5_to_julia_eltype(HDF5Datatype(h5t_get_member_type(objtype.id, i-1))) | ||
N = h5t_get_nmembers(objtype.id) | ||
|
||
membernames = ntuple(N) do i | ||
h5t_get_member_name(objtype.id, i-1) | ||
end | ||
|
||
membertypes = ntuple(N) do i | ||
dtype = HDF5Datatype(h5t_get_member_type(objtype.id, i-1)) | ||
ci = h5t_get_class(dtype.id) | ||
|
||
if ci == H5T_STRING | ||
if h5t_is_variable_str(dtype.id) | ||
return Cstring | ||
else | ||
n = h5t_get_size(dtype.id) | ||
return FixedString{Int(n)} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again I'm guessing it's fine not do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one is actually necessary since NTuple requires the first type parameter to be |
||
end | ||
elseif ci == H5T_VLEN | ||
superid = h5t_get_super(dtype.id) | ||
T = VariableArray{hdf5_to_julia_eltype(HDF5Datatype(superid))} | ||
else | ||
return hdf5_to_julia_eltype(dtype) | ||
end | ||
iscomplex = (membernames == COMPLEX_FIELD_NAMES[]) && (membertypes[1] == membertypes[2]) && (membertypes[1] <: HDF5.HDF5Scalar) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unrelated changes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this part was refactored because now |
||
T = iscomplex ? Complex{membertypes[1]} : HDF5Compound{N} | ||
end | ||
|
||
# check if should be interpreted as complex | ||
iscomplex = COMPLEX_SUPPORT[] && | ||
N == 2 && | ||
(membernames == COMPLEX_FIELD_NAMES[]) && | ||
(membertypes[1] == membertypes[2]) && | ||
(membertypes[1] <: HDF5.HDF5Scalar) | ||
|
||
if iscomplex | ||
T = Complex{membertypes[1]} | ||
else | ||
T = HDF5Compound{N} | ||
T = NamedTuple{Symbol.(membernames), Tuple{membertypes...}} | ||
end | ||
elseif class_id == H5T_ARRAY | ||
T = hdf5array(objtype) | ||
|
@@ -2423,7 +2426,7 @@ function hdf5array(objtype) | |
eltyp = HDF5Datatype(h5t_get_super(objtype.id)) | ||
T = hdf5_to_julia_eltype(eltyp) | ||
dimsizes = ntuple(i -> Int(dims[nd-i+1]), nd) # reverse order | ||
FixedArray{T, dimsizes} | ||
FixedArray{T, dimsizes, prod(dimsizes)} | ||
end | ||
|
||
### Property manipulation ### | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
using Random, Test, HDF5 | ||
|
||
import HDF5.datatype | ||
import Base.unsafe_convert | ||
|
||
struct foo | ||
a::Float64 | ||
b::String | ||
c::String | ||
d::Array{ComplexF64,2} | ||
e::Array{Int64,1} | ||
end | ||
|
||
struct foo_hdf5 | ||
a::Float64 | ||
b::Cstring | ||
c::NTuple{10, Cchar} | ||
d::NTuple{9, ComplexF64} | ||
e::HDF5.Hvl_t | ||
end | ||
|
||
function unsafe_convert(::Type{foo_hdf5}, x::foo) | ||
foo_hdf5(x.a, | ||
Base.unsafe_convert(Cstring, x.b), | ||
ntuple(i -> x.c[i], length(x.c)), | ||
ntuple(i -> x.d[i], length(x.d)), | ||
HDF5.Hvl_t(length(x.e), pointer(x.e)) | ||
) | ||
end | ||
|
||
function datatype(::Type{foo_hdf5}) | ||
dtype = HDF5.h5t_create(HDF5.H5T_COMPOUND, sizeof(foo_hdf5)) | ||
HDF5.h5t_insert(dtype, "a", fieldoffset(foo_hdf5, 1), datatype(Float64)) | ||
|
||
vlenstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1) | ||
HDF5.h5t_set_size(vlenstr_dtype, HDF5.H5T_VARIABLE) | ||
HDF5.h5t_set_cset(vlenstr_dtype, HDF5.H5T_CSET_UTF8) | ||
HDF5.h5t_insert(dtype, "b", fieldoffset(foo_hdf5, 2), vlenstr_dtype) | ||
|
||
fixedstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1) | ||
HDF5.h5t_set_size(fixedstr_dtype, 10 * sizeof(Cchar)) | ||
HDF5.h5t_set_cset(fixedstr_dtype, HDF5.H5T_CSET_UTF8) | ||
HDF5.h5t_insert(dtype, "c", fieldoffset(foo_hdf5, 3), fixedstr_dtype) | ||
|
||
hsz = HDF5.Hsize[3,3] | ||
musm marked this conversation as resolved.
Show resolved
Hide resolved
|
||
array_dtype = HDF5.h5t_array_create(datatype(ComplexF64).id, 2, hsz) | ||
HDF5.h5t_insert(dtype, "d", fieldoffset(foo_hdf5, 4), array_dtype) | ||
|
||
vlen_dtype = HDF5.h5t_vlen_create(datatype(Int64)) | ||
HDF5.h5t_insert(dtype, "e", fieldoffset(foo_hdf5, 5), vlen_dtype) | ||
|
||
HDF5Datatype(dtype) | ||
end | ||
|
||
@testset "compound" begin | ||
N = 10 | ||
v = [foo(rand(), | ||
randstring(rand(10:100)), | ||
randstring(10), | ||
rand(ComplexF64, 3,3), | ||
rand(1:10, rand(10:100)) | ||
) | ||
for _ in 1:N] | ||
v_write = unsafe_convert.(foo_hdf5, v) | ||
|
||
fn = tempname() | ||
h5open(fn, "w") do h5f | ||
dtype = datatype(foo_hdf5) | ||
space = dataspace(v_write) | ||
dset = HDF5.h5d_create(h5f.id, "data", dtype.id, space.id) | ||
HDF5.h5d_write(dset, dtype.id, v_write) | ||
end | ||
|
||
v_read = h5read(fn, "data") | ||
for field in (:a, :b, :c, :d, :e) | ||
f = x -> getfield(x, field) | ||
@test f.(v) == f.(v_read) | ||
end | ||
|
||
T = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, Cstring}} | ||
TT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, T}} | ||
TTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TT}} | ||
TTTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TTT}} | ||
|
||
@test HDF5.do_reclaim(TTTT) == true | ||
@test HDF5.do_normalize(TTTT) == true | ||
|
||
T = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, HDF5.FixedArray}} | ||
TT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, T}} | ||
TTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TT}} | ||
TTTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TTT}} | ||
|
||
@test HDF5.do_reclaim(TTTT) == false | ||
@test HDF5.do_normalize(TTTT) == true | ||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how / where's this field being used ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is used so that the julia type you construct for an
H5T_ARRAY
is memory compatible and can be directly written into which is necessary for compound datatypes withH5T_ARRAY
members.