Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve performance/ergonomics of reading compound datatypes #592

Merged
merged 9 commits into from
Mar 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59"
Blosc = "a74b3585-a348-5f62-a45c-50e91977d574"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

[compat]
julia = "1.3"
Expand Down
179 changes: 91 additions & 88 deletions src/HDF5.jl
Original file line number Diff line number Diff line change
Expand Up @@ -465,13 +465,6 @@ end
# Two reference objects are equal exactly when their wrapped `r` fields are equal.
function ==(a::HDF5ReferenceObj, b::HDF5ReferenceObj)
    return a.r == b.r
end

# Hash through the wrapped `r` field so that `hash` stays consistent with `==`.
function hash(x::HDF5ReferenceObj, h::UInt)
    return hash(x.r, h)
end

# Compound types
struct HDF5Compound{N}
data::NTuple{N,Any}
membername::NTuple{N,String}
membertype::NTuple{N,Type}
end

# Opaque types
struct HDF5Opaque
data
Expand All @@ -482,9 +475,24 @@ end
# Field-less marker type carrying only the type parameter `T`.
# NOTE(review): presumably stands in for a zero-length array with eltype `T` — confirm against callers.
struct EmptyArray{T} end

# Stub types to encode fixed-size arrays for H5T_ARRAY
# Field-less stub: `T` is the element type and `D` the dimensions of the fixed-size array.
struct FixedArray{T,D} end

# The dimensions are stored in the second type parameter.
function size(::Type{FixedArray{T,D}}) where {T,D}
    return D
end

# The element type is stored in the first type parameter.
function eltype(::Type{FixedArray{T,D}}) where {T,D}
    return T
end
struct FixedArray{T,D,L}
data::NTuple{L, T}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How and where is this field being used?

Copy link
Contributor Author

@kleinhenz kleinhenz Mar 2, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This field makes the Julia type constructed for an H5T_ARRAY memory-compatible with it, so that HDF5 can write directly into it; this is necessary for compound datatypes with H5T_ARRAY members.

end
# Dimensions of a fixed-size array type, taken from its `D` type parameter.
function size(::Type{FixedArray{T,D,L}}) where {T,D,L}
    return D
end

# Instance form: defer to the type-level method.
size(x::FixedArray) = size(typeof(x))

# Element type of a fixed-size array type, taken from its `T` type parameter.
function eltype(::Type{FixedArray{T,D,L}}) where {T,D,L}
    return T
end

# Instance form: defer to the type-level method.
eltype(x::FixedArray) = eltype(typeof(x))

# Fixed-length string stored inline as exactly `N` C chars.
struct FixedString{N}
    data::NTuple{N, Cchar}
end

# Number of chars a `FixedString{N}` holds, read from the type parameter.
function length(::Type{FixedString{N}}) where {N}
    return N
end

# Variable-length array descriptor: an element count plus an untyped pointer.
# The element type `T` is carried only as a type parameter.
struct VariableArray{T}
    len::Csize_t
    p::Ptr{Cvoid}
end

# Element type of a variable-length array type, read from its type parameter.
function eltype(::Type{VariableArray{T}}) where {T}
    return T
end

# VLEN objects
struct HDF5Vlen{T}
Expand Down Expand Up @@ -1459,73 +1467,45 @@ function getindex(parent::Union{HDF5File, HDF5Group, HDF5Dataset}, r::HDF5Refere
h5object(obj_id, parent)
end

# Helper for reading compound types.
#
# Reads one record from `io`, consuming one field per (type, byte size) pair from
# `membertype` / `membersize`, and returns the decoded fields as a tuple.
function read_row(io::IO, membertype, membersize)
    fields = Any[]
    for (T, nbytes) in zip(membertype, membersize)
        value = if T === String
            # strings occupy `nbytes` raw bytes; strip the null padding
            unpad(read!(io, Vector{UInt8}(undef, nbytes)), H5T_STR_NULLPAD)
        elseif T <: HDF5.FixedArray && eltype(T) <: HDF5BitsKind
            # fixed-size array of bits types: read flat, then restore the shape
            flat = read!(io, Vector{eltype(T)}(undef, prod(size(T))))
            reshape(flat, size(T))
        elseif T <: HDF5BitsKind
            read(io, T)
        else
            # for other types, just store the raw bytes and let the user
            # decide what to do
            read!(io, Vector{UInt8}(undef, nbytes))
        end
        push!(fields, value)
    end
    return (fields...,)
end
# Convert the memory-compatible stand-in types used while reading into native
# Julia values. Anything without a dedicated method is returned unchanged.
normalize_types(x) = x

# NamedTuples are normalized field by field, keeping the field names.
normalize_types(x::NamedTuple{T}) where T = NamedTuple{T}(map(normalize_types, values(x)))

# Copies the C string into a Julia `String`.
normalize_types(x::Cstring) = unsafe_string(x)

# Decode the stored chars as raw bytes. `Cchar` is signed on most platforms, so any
# byte >= 0x80 (i.e. every non-ASCII UTF-8 byte) is negative; calling `Char` on it
# throws, and even where it does not, it would treat each byte as a code point
# instead of decoding UTF-8. Reinterpreting as `UInt8` and building the `String`
# from the bytes handles both. ASCII content gives the same result as before.
# NOTE(review): padding / terminator bytes are kept as-is, exactly as before.
normalize_types(x::FixedString) = String(UInt8[c % UInt8 for c in x.data])

# Collect the inline elements and restore the declared dimensions.
normalize_types(x::FixedArray) = reshape(collect(x.data), size(x)...)

# Wrap the `len` elements behind `p` without taking ownership, then copy them so
# the result does not alias the original buffer.
normalize_types(x::VariableArray) = copy(unsafe_wrap(Array, convert(Ptr{eltype(x)}, x.p), x.len, own=false))

# Read compound type
function read(obj::HDF5Dataset, T::Union{Type{Array{HDF5Compound{N}}},Type{HDF5Compound{N}}}) where {N}
t = datatype(obj)
local sz = 0; local n;
local membername; local membertype;
local memberoffset; local memberfiletype; local membersize;
try
memberfiletype = Vector{HDF5Datatype}(undef,N)
membertype = Vector{Type}(undef,N)
membername = Vector{String}(undef,N)
memberoffset = Vector{UInt64}(undef,N)
membersize = Vector{UInt32}(undef,N)
for i = 1:N
filetype = HDF5Datatype(h5t_get_member_type(t.id, i-1))
memberfiletype[i] = filetype
membertype[i] = hdf5_to_julia_eltype(filetype)
memberoffset[i] = sz
membersize[i] = sizeof(filetype)
sz += sizeof(filetype)
membername[i] = h5t_get_member_name(t.id, i-1)
end
finally
close(t)
end
# Build the "memory type"
memtype_id = h5t_create(H5T_COMPOUND, sz)
for i = 1:N
h5t_insert(memtype_id, membername[i], memberoffset[i], memberfiletype[i].id) # FIXME strings
end
# Read the raw data
buf = Vector{UInt8}(undef,length(obj)*sz)
h5d_read(obj.id, memtype_id, H5S_ALL, H5S_ALL, obj.xfer, buf)

# Convert to the appropriate data format using iobuffer
iobuff = IOBuffer(buf)
data = Any[]
while !eof(iobuff)
push!(data, read_row(iobuff, membertype, membersize))
end
# convert HDF5Compound type parameters to tuples
membername = (membername...,)
membertype = (membertype...,)
if T === HDF5Compound{N}
return HDF5Compound(data[1], membername, membertype)
else
return [HDF5Compound(elem, membername, membertype) for elem in data]
end
# Whether values of a type must go through `normalize_types` after being read.

# Default: nothing to convert.
function do_normalize(::Type{T}) where T
    return false
end

# A NamedTuple needs normalizing as soon as any of its field types does
# (nested NamedTuples are handled through recursion).
function do_normalize(::Type{NamedTuple{T, U}}) where T where U
    return any(do_normalize, fieldtypes(U))
end

# The stand-in types themselves always need converting.
function do_normalize(::Type{T}) where T <: Union{Cstring, FixedString, FixedArray, VariableArray}
    return true
end

# Whether a buffer of the given type must be passed to `h5d_vlen_reclaim` after reading.

# Default: nothing to reclaim.
function do_reclaim(::Type{T}) where T
    return false
end

# A NamedTuple needs reclaiming as soon as any of its field types does
# (nested NamedTuples are handled through recursion).
function do_reclaim(::Type{NamedTuple{T, U}}) where T where U
    return any(do_reclaim, fieldtypes(U))
end

# Variable-length strings and arrays are the types that need reclaiming.
function do_reclaim(::Type{T}) where T <: Union{Cstring, VariableArray}
    return true
end

# Read a compound dataset directly into a buffer of NamedTuples.
#
# `U` is the NamedTuple type describing one record. Passing `Array{U}` returns the
# whole dataset as an array; passing `U` itself returns only the first element.
# Fields stored as stand-in types (see `do_normalize`) are converted to native Julia
# values before returning.
#
# Throws an error if `sizeof(U)` differs from the size of the native HDF5 memory type.
function read(dset::HDF5Dataset, T::Union{Type{Array{U}}, Type{U}}) where U <: NamedTuple
    filetype = HDF5.datatype(dset)
    memtype_id = try
        HDF5.h5t_get_native_type(filetype.id) # padded layout in memory
    finally
        # the file datatype is only needed to derive the memory type
        close(filetype)
    end

    try
        # This validates data coming from the file, so raise a real error rather
        # than relying on `@assert`, which is meant for internal invariants only.
        memsize = HDF5.h5t_get_size(memtype_id)
        if sizeof(U) != memsize
            error("Type sizes mismatch! sizeof($U) is $(sizeof(U)) bytes but the native HDF5 type is $memsize bytes")
        end

        buf = Array{U}(undef, size(dset))
        HDF5.h5d_read(dset.id, memtype_id, HDF5.H5S_ALL, HDF5.H5S_ALL, HDF5.H5P_DEFAULT, buf)

        try
            # Whenever reclaiming is needed, normalizing is needed too, so `out`
            # is a converted copy and never aliases memory released below.
            out = do_normalize(U) ? normalize_types.(buf) : buf
            return T <: NamedTuple ? out[1] : out
        finally
            # Release the variable-length data even if the conversion above threw.
            if do_reclaim(U)
                dspace = dataspace(dset)
                # NOTE I have seen this call fail but I cannot reproduce
                h5d_vlen_reclaim(memtype_id, dspace.id, H5P_DEFAULT, buf)
            end
        end
    finally
        # always release the memory type, including on error paths
        HDF5.h5t_close(memtype_id)
    end
end

# Read OPAQUE datasets and attributes
Expand Down Expand Up @@ -2006,19 +1986,42 @@ function hdf5_to_julia_eltype(objtype)
super_id = h5t_get_super(objtype.id)
T = HDF5Vlen{hdf5_to_julia_eltype(HDF5Datatype(super_id))}
elseif class_id == H5T_COMPOUND
N = Int(h5t_get_nmembers(objtype.id))
# check if should be interpreted as complex
if COMPLEX_SUPPORT[] && N == 2
membernames = ntuple(N) do i
h5t_get_member_name(objtype.id, i-1)
end
membertypes = ntuple(N) do i
hdf5_to_julia_eltype(HDF5Datatype(h5t_get_member_type(objtype.id, i-1)))
N = h5t_get_nmembers(objtype.id)

membernames = ntuple(N) do i
h5t_get_member_name(objtype.id, i-1)
end

membertypes = ntuple(N) do i
dtype = HDF5Datatype(h5t_get_member_type(objtype.id, i-1))
ci = h5t_get_class(dtype.id)

if ci == H5T_STRING
if h5t_is_variable_str(dtype.id)
return Cstring
else
n = h5t_get_size(dtype.id)
return FixedString{Int(n)}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, I'm guessing it's fine not to do `Int` here as well, unless you foresee negative consequences.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one is actually necessary since NTuple requires the first type parameter to be Int64.

end
elseif ci == H5T_VLEN
superid = h5t_get_super(dtype.id)
T = VariableArray{hdf5_to_julia_eltype(HDF5Datatype(superid))}
else
return hdf5_to_julia_eltype(dtype)
end
iscomplex = (membernames == COMPLEX_FIELD_NAMES[]) && (membertypes[1] == membertypes[2]) && (membertypes[1] <: HDF5.HDF5Scalar)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unrelated changes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This part was refactored because membernames/membertypes are now used outside of the COMPLEX_SUPPORT path.

T = iscomplex ? Complex{membertypes[1]} : HDF5Compound{N}
end

# check if should be interpreted as complex
iscomplex = COMPLEX_SUPPORT[] &&
N == 2 &&
(membernames == COMPLEX_FIELD_NAMES[]) &&
(membertypes[1] == membertypes[2]) &&
(membertypes[1] <: HDF5.HDF5Scalar)

if iscomplex
T = Complex{membertypes[1]}
else
T = HDF5Compound{N}
T = NamedTuple{Symbol.(membernames), Tuple{membertypes...}}
end
elseif class_id == H5T_ARRAY
T = hdf5array(objtype)
Expand Down Expand Up @@ -2423,7 +2426,7 @@ function hdf5array(objtype)
eltyp = HDF5Datatype(h5t_get_super(objtype.id))
T = hdf5_to_julia_eltype(eltyp)
dimsizes = ntuple(i -> Int(dims[nd-i+1]), nd) # reverse order
FixedArray{T, dimsizes}
FixedArray{T, dimsizes, prod(dimsizes)}
end

### Property manipulation ###
Expand Down
95 changes: 95 additions & 0 deletions test/compound.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
using Random, Test, HDF5

import HDF5.datatype
import Base.unsafe_convert

# Test record in its natural Julia form. Converted to `foo_hdf5` before being written.
struct foo
a::Float64
# arbitrary-length string
b::String
# string that the fixture creates with exactly 10 characters
c::String
# the fixture fills this with a 3x3 matrix
d::Array{ComplexF64,2}
# arbitrary-length integer vector
e::Array{Int64,1}
end

# Memory-layout counterpart of `foo`: each field has the in-memory representation of
# the compound member that `datatype(::Type{foo_hdf5})` inserts at its offset.
struct foo_hdf5
a::Float64
# member "b": variable-length string
b::Cstring
# member "c": fixed-length string of 10 chars
c::NTuple{10, Cchar}
# member "d": 3x3 array of ComplexF64, stored flat
d::NTuple{9, ComplexF64}
# member "e": variable-length sequence of Int64
e::HDF5.Hvl_t
end

# Build the memory-layout record for `x`.
#
# The result holds raw pointers into `x.b` and `x.e`, so it is only usable while
# `x` is still alive.
function unsafe_convert(::Type{foo_hdf5}, x::foo)
    vlen_string = Base.unsafe_convert(Cstring, x.b)
    fixed_string = ntuple(k -> x.c[k], length(x.c))
    flat_matrix = ntuple(k -> x.d[k], length(x.d))
    vlen_ints = HDF5.Hvl_t(length(x.e), pointer(x.e))
    return foo_hdf5(x.a, vlen_string, fixed_string, flat_matrix, vlen_ints)
end

# Build the HDF5 compound datatype describing `foo_hdf5`.
# Each member is inserted at the byte offset of the corresponding struct field, and
# the compound's total size is `sizeof(foo_hdf5)`. Returns the id wrapped in an
# `HDF5Datatype`.
function datatype(::Type{foo_hdf5})
dtype = HDF5.h5t_create(HDF5.H5T_COMPOUND, sizeof(foo_hdf5))
HDF5.h5t_insert(dtype, "a", fieldoffset(foo_hdf5, 1), datatype(Float64))

# "b": variable-length UTF-8 string
vlenstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1)
HDF5.h5t_set_size(vlenstr_dtype, HDF5.H5T_VARIABLE)
HDF5.h5t_set_cset(vlenstr_dtype, HDF5.H5T_CSET_UTF8)
HDF5.h5t_insert(dtype, "b", fieldoffset(foo_hdf5, 2), vlenstr_dtype)

# "c": fixed-length UTF-8 string of 10 chars
fixedstr_dtype = HDF5.h5t_copy(HDF5.H5T_C_S1)
HDF5.h5t_set_size(fixedstr_dtype, 10 * sizeof(Cchar))
HDF5.h5t_set_cset(fixedstr_dtype, HDF5.H5T_CSET_UTF8)
HDF5.h5t_insert(dtype, "c", fieldoffset(foo_hdf5, 3), fixedstr_dtype)

# "d": rank-2 array of ComplexF64 with dimensions 3x3
hsz = HDF5.Hsize[3,3]
musm marked this conversation as resolved.
Show resolved Hide resolved
array_dtype = HDF5.h5t_array_create(datatype(ComplexF64).id, 2, hsz)
HDF5.h5t_insert(dtype, "d", fieldoffset(foo_hdf5, 4), array_dtype)

# "e": variable-length sequence of Int64
vlen_dtype = HDF5.h5t_vlen_create(datatype(Int64))
HDF5.h5t_insert(dtype, "e", fieldoffset(foo_hdf5, 5), vlen_dtype)

HDF5Datatype(dtype)
end

@testset "compound" begin
    # Round trip: write records through the raw memory layout, read them back
    # through the NamedTuple reader, and compare field by field.
    N = 10
    v = [foo(rand(),
             randstring(rand(10:100)),
             randstring(10),
             rand(ComplexF64, 3,3),
             rand(1:10, rand(10:100))
            )
         for _ in 1:N]
    v_write = unsafe_convert.(foo_hdf5, v)

    fn = tempname()
    try
        h5open(fn, "w") do h5f
            dtype = datatype(foo_hdf5)
            space = dataspace(v_write)
            dset = HDF5.h5d_create(h5f.id, "data", dtype.id, space.id)
            try
                # `v_write` holds raw pointers into the strings and vectors owned by
                # `v`; keep `v` rooted for the duration of the write.
                GC.@preserve v HDF5.h5d_write(dset, dtype.id, v_write)
            finally
                # `h5d_create` returns a bare id, so it must be closed by hand
                HDF5.h5d_close(dset)
            end
        end

        v_read = h5read(fn, "data")
        for field in (:a, :b, :c, :d, :e)
            f = x -> getfield(x, field)
            @test f.(v) == f.(v_read)
        end
    finally
        # do not leave the temporary file behind, even when an assertion errors
        rm(fn; force=true)
    end

    # do_reclaim / do_normalize must look through nested NamedTuples
    T = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, Cstring}}
    TT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, T}}
    TTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TT}}
    TTTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TTT}}

    @test HDF5.do_reclaim(TTTT) == true
    @test HDF5.do_normalize(TTTT) == true

    # a FixedArray needs normalizing but owns no memory to reclaim
    T = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, HDF5.FixedArray}}
    TT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, T}}
    TTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TT}}
    TTTT = NamedTuple{(:a, :b, :c, :d, :e, :f), Tuple{Int, Int, Int, Int, Int, TTT}}

    @test HDF5.do_reclaim(TTTT) == false
    @test HDF5.do_normalize(TTTT) == true
end
13 changes: 4 additions & 9 deletions test/plain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -320,12 +320,7 @@ rm(tmpdir, recursive=true)
test_files = joinpath(@__DIR__, "test_files")

d = h5read(joinpath(test_files, "compound.h5"), "/data")
@test typeof(d[1]) === HDF5.HDF5Compound{4}
@test length(d) == 2
dtypes = [typeof(x) for x in d[1].data]
@test dtypes == [Float64, Vector{Float64}, Vector{Float64}, Float64]
@test length(d[1].data[2]) == 3
@test d[1].membername == ("wgt", "xyz", "uvw", "E")
@test typeof(d[1]) == NamedTuple{(:wgt, :xyz, :uvw, :E), Tuple{Float64, Array{Float64, 1}, Array{Float64, 1}, Float64}}

# get-datasets
fn = tempname()
Expand Down Expand Up @@ -450,12 +445,12 @@ end # testset plain

HDF5.disable_complex_support()
z = read(fr, "ComplexF64")
@test isa(z, HDF5.HDF5Compound{2})
@test isa(z, NamedTuple{(:r, :i), Tuple{Float64, Float64}})

Acmplx32 = read(fr, "Acmplx32")
@test eltype(Acmplx32) == HDF5.HDF5Compound{2}
@test eltype(Acmplx32) == NamedTuple{(:r, :i), Tuple{Float32, Float32}}
Acmplx64 = read(fr, "Acmplx64")
@test eltype(Acmplx64) == HDF5.HDF5Compound{2}
@test eltype(Acmplx64) == NamedTuple{(:r, :i), Tuple{Float64, Float64}}

close(fr)

Expand Down
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ using Pkg
println("HDF5 version ", HDF5.h5_get_libversion())

include("plain.jl")
include("compound.jl")
include("readremote.jl")
include("extend_test.jl")
include("gc.jl")
Expand Down