From 6be3259e8e5982bd41f0ce4e7e01eca832f13a87 Mon Sep 17 00:00:00 2001 From: ilia-kats Date: Thu, 8 Jun 2023 08:24:52 +0200 Subject: [PATCH] add support for variable-length strings (#116) * add support for variable-length strings * add unit test --- src/Filters.jl | 35 ++++++++++++++++++++++++++++++++++- src/ZArray.jl | 22 ++++++++-------------- src/metadata.jl | 5 ++++- test/python.jl | 9 +++++++-- test/runtests.jl | 7 +++---- 5 files changed, 56 insertions(+), 22 deletions(-) diff --git a/src/Filters.jl b/src/Filters.jl index 2666bdf..fde3db9 100644 --- a/src/Filters.jl +++ b/src/Filters.jl @@ -27,6 +27,13 @@ Encodes and decodes variable-length arrays of arbitrary data type """ struct VLenArrayFilter{T} <: Filter{T,UInt8} end +""" + VLenUTF8Filter + +Encodes and decodes variable-length unicode strings +""" +struct VLenUTF8Filter <: Filter{String, UInt8} end + function zdecode(ain, ::VLenArrayFilter{T}) where T f = IOBuffer(ain) nitems = read(f, UInt32) @@ -51,8 +58,34 @@ function zencode(ain,::VLenArrayFilter) take!(b) end +function zdecode(ain, ::VLenUTF8Filter) + f = IOBuffer(ain) + nitems = read(f, UInt32) + out = Array{String}(undef, nitems) + for i in 1:nitems + clen = read(f, UInt32) + out[i] = String(read(f, clen)) + end + close(f) + out +end + +function zencode(ain, ::VLenUTF8Filter) + b = IOBuffer() + nitems = length(ain) + write(b, UInt32(nitems)) + for a in ain + utf8encoded = transcode(String, a) + write(b, UInt32(ncodeunits(utf8encoded))) + write(b, utf8encoded) + end + take!(b) +end + JSON.lower(::VLenArrayFilter{T}) where T = Dict("id"=>"vlen-array","dtype"=> typestr(T) ) +JSON.lower(::VLenUTF8Filter) = Dict("id"=>"vlen-utf8") getfilter(::Type{<:VLenArrayFilter}, f) = VLenArrayFilter{typestr(f["dtype"])}() +getfilter(::Type{<:VLenUTF8Filter}, f) = VLenUTF8Filter() -filterdict = Dict("vlen-array"=>VLenArrayFilter) \ No newline at end of file +filterdict = Dict("vlen-array"=>VLenArrayFilter, "vlen-utf8"=>VLenUTF8Filter) diff --git a/src/ZArray.jl 
b/src/ZArray.jl index ee74215..6cc6c68 100644 --- a/src/ZArray.jl +++ b/src/ZArray.jl @@ -11,12 +11,13 @@ const concurrent_io_tasks = Ref(50) getfillval(::Type{T}, t::String) where {T <: Number} = parse(T, t) getfillval(::Type{T}, t::Union{T,Nothing}) where {T} = t -struct SenMissArray{T,N,V} <: AbstractArray{Union{T,Missing},N} +struct SenMissArray{T,N} <: AbstractArray{Union{T,Missing},N} x::Array{T,N} + senval::T end -SenMissArray(x::Array{T,N},v) where {T,N} = SenMissArray{T,N,convert(T,v)}(x) +SenMissArray(x::Array{T,N},v) where {T,N} = SenMissArray{T,N}(x,convert(T,v)) Base.size(x::SenMissArray) = size(x.x) -senval(x::SenMissArray{<:Any,<:Any,V}) where V = V +senval(x::SenMissArray) = x.senval function Base.getindex(x::SenMissArray,i::Int) v = x.x[i] isequal(v,senval(x)) ? missing : v @@ -78,6 +79,7 @@ storageratio(z::ZArray{<:Vector}) = "unknown" nobytes(z::ZArray) = length(z)*sizeof(eltype(z)) nobytes(z::ZArray{<:Vector}) = "unknown" +nobytes(z::ZArray{<:String}) = "unknown" zinfo(z::ZArray) = zinfo(stdout,z) function zinfo(io::IO,z::ZArray) @@ -361,20 +363,12 @@ function filterfromtype(::Type{<:AbstractArray{T}}) where T (VLenArrayFilter{T}(),) end +filterfromtype(::Type{<:Union{<:AbstractString, Union{<:AbstractString, Missing}}}) = (VLenUTF8Filter(),) +filterfromtype(::Type{<:Union{MaxLengthString, Union{MaxLengthString, Missing}}}) = nothing + #Not all Array types can be mapped directly to a valid ZArray encoding. #Here we try to determine the correct element type to_zarrtype(::AbstractArray{T}) where T = T -function to_zarrtype(a::AbstractArray{<:Union{AbstractString,Missing}}) - isasc, maxlen = mapreduce( - x->ismissing(x) ? (true,0) : (isascii(x),length(x)), - (x,y)->((x[1] && y[1]),max(x[2],y[2])), - a, - init = (true, 0,false) - ) - et = isasc ? UInt8 : UInt32 - newt = MaxLengthString{maxlen,et} - return eltype(a)>:Missing ? 
Union{newt,Missing} : newt -end to_zarrtype(a::AbstractArray{<:Date}) = DateTime64{Dates.Day} to_zarrtype(a::AbstractArray{<:DateTime}) = DateTime64{Dates.Millisecond} diff --git a/src/metadata.jl b/src/metadata.jl index ae50634..8ea78a3 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -45,6 +45,7 @@ Base.convert(::Type{DateTime64{P}}, t::Date) where P = DateTime64{P}(Dates.value Base.convert(::Type{DateTime64{P}}, t::DateTime) where P = DateTime64{P}(Dates.value(P(t-DateTime(1970)))) Base.convert(::Type{DateTime64{P}}, t::DateTime64{Q}) where {P,Q} = DateTime64{P}(Dates.value(P(Q(t.i)))) Base.zero(t::Union{DateTime64, Type{<:DateTime64}}) = t(0) +Base.zero(t::Union{String, Type{String}}) = "" # Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{Date}) = Date # Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{DateTime}) = DateTime # Base.promote_rule(::Type{<:DateTime64{<:Dates.TimePeriod}}, ::Type{Date}) = DateTime @@ -63,6 +64,7 @@ typestr(::Type{MaxLengthString{N,UInt32}}) where N = string('<', 'U', N) typestr(::Type{MaxLengthString{N,UInt8}}) where N = string('<', 'S', N) typestr(::Type{<:Array}) = "|O" typestr(::Type{<:DateTime64{P}}) where P = "<M8[$(pdt64string(P))]" +typestr(::Type{<:AbstractString}) = "|O" const typestr_regex = r"^([<|>])([tbiufcmMOSUV])(\d*)(\[\w+\])?$" const typemap = Dict{Tuple{Char, Int}, DataType}( @@ -96,7 +98,7 @@ function typestr(s::AbstractString, filterlist=nothing) if filterlist === nothing throw(ArgumentError("Object array can only be parsed when an appropriate filter is defined")) end - return Vector{sourcetype(first(filterlist))} + return sourcetype(first(filterlist)) end isempty(typesize) && throw((ArgumentError("$s is not a valid numpy typestr"))) tc, ts = first(typecode), parse(Int, typesize) @@ -243,4 +245,5 @@ Base.eltype(::Metadata{T}) where T = T fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v) fill_value_decoding(v::Nothing, ::Any) = v fill_value_decoding(v, T) = T(v) +fill_value_decoding(v::Number, T::Type{String}) = v == 0 ? 
"" : T(UInt8[v]) fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v diff --git a/test/python.jl b/test/python.jl index 6ff15d7..e2568b0 100644 --- a/test/python.jl +++ b/test/python.jl @@ -27,7 +27,8 @@ dtypes = (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float16, Float32, Float64, Complex{Float32}, Complex{Float64}, - Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32}) + Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32}, + String) compressors = ( "no"=>NoCompressor(), "blosc"=>BloscCompressor(cname="zstd"), @@ -63,7 +64,7 @@ gatts = g.attrs dtypesp = ("uint8","uint16","uint32","uint64", "int8","int16","int32","int64", "float16","float32","float64", - "complex64", "complex128","bool","S10","U10") + "complex64", "complex128","bool","S10","U10", "O") #Test accessing arrays from python and reading data for i=1:length(dtypes), co in compressors @@ -115,6 +116,7 @@ end data = rand(Int32,2,6,10) py""" import numcodecs +import numpy as np g = zarr.group($ppython) g.attrs["groupatt"] = "Hi" z1 = g.create_dataset("a1", shape=(2,6,10),chunks=(1,2,3), dtype='i4') @@ -122,6 +124,8 @@ z1[:,:,:]=$data z1.attrs["test"]={"b": 6} z2 = g.create_dataset("a2", shape=(5,),chunks=(5,), dtype='S1', compressor=numcodecs.Zlib()) z2[:]=[k for k in 'hallo'] +z3 = g.create_dataset('a3', shape=(2,), dtype=str) +z3[:]=np.asarray(['test1', 'test234'], dtype='O') zarr.consolidate_metadata($ppython) """ @@ -135,6 +139,7 @@ a1 = g["a1"] @test a1.attrs["test"]==Dict("b"=>6) # Test reading the string array @test String(g["a2"][:])=="hallo" +@test g["a3"] == ["test1", "test234"] # And test for consolidated metadata # Delete files so we make sure they are not accessed diff --git a/test/runtests.jl b/test/runtests.jl index 4716250..4c996e3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -218,13 +218,12 @@ end end @testset "string array getindex/setindex" begin - using Zarr: MaxLengthString aa = ["this", "is", "all ", "ascii"] bb = ["And" "Unicode"; 
"ματριξ" missing] a = ZArray(aa) - b = ZArray(bb, fill_value = MaxLengthString{7,UInt32}("")) - @test eltype(a) == MaxLengthString{5,UInt8} - @test eltype(b) == Union{MaxLengthString{7,UInt32},Missing} + b = ZArray(bb, fill_value = "") + @test eltype(a) == String + @test eltype(b) == Union{String,Missing} @test a[:] == ["this", "is", "all ", "ascii"] @test all(isequal.(b[:,:],["And" "Unicode"; "ματριξ" missing])) end