add support for variable-length strings (#116)

* add support for variable-length strings * add unit test
JuliaIO · Jun 8, 2023 · 6be3259 · 6be3259 · meggart · Jun 8, 2023
1 parent 26f7ed0
commit 6be3259
Show file tree

Hide file tree

Showing 5 changed files with 56 additions and 22 deletions.
diff --git a/src/Filters.jl b/src/Filters.jl
@@ -27,6 +27,13 @@ Encodes and decodes variable-length arrays of arbitrary data type
 """
 struct VLenArrayFilter{T} <: Filter{T,UInt8} end
 
+"""
+    VLenUTF8Filter
+
+Encodes and decodes arrays of variable-length Unicode strings as UTF-8 bytes (filter id `vlen-utf8`).
+"""
+struct VLenUTF8Filter <: Filter{String, UInt8} end
+
 function zdecode(ain, ::VLenArrayFilter{T}) where T
     f = IOBuffer(ain)
     nitems = read(f, UInt32)
@@ -51,8 +58,34 @@ function zencode(ain,::VLenArrayFilter)
     take!(b)
 end
 
+"""
+    zdecode(ain, ::VLenUTF8Filter)
+
+Decode the byte buffer `ain` into a `Vector{String}`.
+
+The buffer is read as a `UInt32` item count followed, for every item, by a
+`UInt32` byte length and that many bytes of string data. Count and length
+fields are interpreted as little-endian, independent of the host byte order.
+
+Throws `EOFError` if the buffer ends before all announced bytes were read.
+"""
+function zdecode(ain, ::VLenUTF8Filter)
+    f = IOBuffer(ain)
+    nitems = ltoh(read(f, UInt32))
+    out = Vector{String}(undef, nitems)
+    for i in eachindex(out)
+        clen = ltoh(read(f, UInt32))
+        bytes = read(f, clen)
+        # `read(io, n)` returns fewer than `n` bytes at end of input instead of
+        # failing; a short read means the chunk is truncated, so raise the same
+        # error the fixed-size header reads above would raise.
+        length(bytes) == clen || throw(EOFError())
+        out[i] = String(bytes)
+    end
+    close(f)
+    return out
+end
+
+"""
+    zencode(ain, ::VLenUTF8Filter)
+
+Encode the collection of strings `ain` into a `Vector{UInt8}`.
+
+Writes a `UInt32` item count followed, for every item, by a `UInt32` byte
+length and the string's code units. Count and length fields are written as
+little-endian, independent of the host byte order, so that `zdecode` with the
+same filter reads them back unchanged.
+
+Throws `InexactError` if the item count or a string's byte length does not fit
+into a `UInt32`.
+"""
+function zencode(ain, ::VLenUTF8Filter)
+    b = IOBuffer()
+    nitems = length(ain)
+    write(b, htol(UInt32(nitems)))
+    for a in ain
+        utf8encoded = transcode(String, a)
+        # The length prefix counts bytes (code units), not characters.
+        write(b, htol(UInt32(ncodeunits(utf8encoded))))
+        write(b, utf8encoded)
+    end
+    return take!(b)
+end
+
 JSON.lower(::VLenArrayFilter{T}) where T = Dict("id"=>"vlen-array","dtype"=> typestr(T) )
+JSON.lower(::VLenUTF8Filter) = Dict("id"=>"vlen-utf8")
 
 getfilter(::Type{<:VLenArrayFilter}, f) = VLenArrayFilter{typestr(f["dtype"])}()
+getfilter(::Type{<:VLenUTF8Filter}, f) = VLenUTF8Filter()
 
-filterdict = Dict("vlen-array"=>VLenArrayFilter)
+filterdict = Dict("vlen-array"=>VLenArrayFilter, "vlen-utf8"=>VLenUTF8Filter)
diff --git a/src/ZArray.jl b/src/ZArray.jl
@@ -11,12 +11,13 @@ const concurrent_io_tasks = Ref(50)
 getfillval(::Type{T}, t::String) where {T <: Number} = parse(T, t)
 getfillval(::Type{T}, t::Union{T,Nothing}) where {T} = t
 
-struct SenMissArray{T,N,V} <: AbstractArray{Union{T,Missing},N}
+struct SenMissArray{T,N} <: AbstractArray{Union{T,Missing},N}
   x::Array{T,N}
+  senval::T
 end
-SenMissArray(x::Array{T,N},v) where {T,N} = SenMissArray{T,N,convert(T,v)}(x)
+SenMissArray(x::Array{T,N},v) where {T,N} = SenMissArray{T,N}(x,convert(T,v))
 Base.size(x::SenMissArray) = size(x.x)
-senval(x::SenMissArray{<:Any,<:Any,V}) where V = V
+senval(x::SenMissArray) = x.senval
 function Base.getindex(x::SenMissArray,i::Int)
   v = x.x[i]
   isequal(v,senval(x)) ? missing : v
@@ -78,6 +79,7 @@ storageratio(z::ZArray{<:Vector}) = "unknown"
 
 nobytes(z::ZArray) = length(z)*sizeof(eltype(z))
 nobytes(z::ZArray{<:Vector}) = "unknown"
+nobytes(z::ZArray{<:String}) = "unknown"
 
 zinfo(z::ZArray) = zinfo(stdout,z)
 function zinfo(io::IO,z::ZArray)
@@ -361,20 +363,12 @@ function filterfromtype(::Type{<:AbstractArray{T}}) where T
   (VLenArrayFilter{T}(),)
 end
 
+filterfromtype(::Type{<:Union{<:AbstractString, Union{<:AbstractString, Missing}}}) = (VLenUTF8Filter(),)
+filterfromtype(::Type{<:Union{MaxLengthString, Union{MaxLengthString, Missing}}}) = nothing
+
 #Not all Array types can be mapped directly to a valid ZArray encoding.
 #Here we try to determine the correct element type
 to_zarrtype(::AbstractArray{T}) where T = T
-function to_zarrtype(a::AbstractArray{<:Union{AbstractString,Missing}})
-  isasc, maxlen = mapreduce(
-  x->ismissing(x) ? (true,0) : (isascii(x),length(x)),
-  (x,y)->((x[1] && y[1]),max(x[2],y[2])),
-  a,
-  init = (true, 0,false)
-  )
-  et = isasc ? UInt8 : UInt32
-  newt = MaxLengthString{maxlen,et}
-  return eltype(a)>:Missing ? Union{newt,Missing} : newt
-end
 to_zarrtype(a::AbstractArray{<:Date}) = DateTime64{Dates.Day}
 to_zarrtype(a::AbstractArray{<:DateTime}) = DateTime64{Dates.Millisecond}
 

diff --git a/src/metadata.jl b/src/metadata.jl
@@ -45,6 +45,7 @@ Base.convert(::Type{DateTime64{P}}, t::Date) where P = DateTime64{P}(Dates.value
 Base.convert(::Type{DateTime64{P}}, t::DateTime) where P = DateTime64{P}(Dates.value(P(t-DateTime(1970))))
 Base.convert(::Type{DateTime64{P}}, t::DateTime64{Q}) where {P,Q} = DateTime64{P}(Dates.value(P(Q(t.i))))
 Base.zero(t::Union{DateTime64, Type{<:DateTime64}}) = t(0)
+Base.zero(t::Union{String, Type{String}}) = ""
 # Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{Date}) = Date 
 # Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{DateTime}) = DateTime
 # Base.promote_rule(::Type{<:DateTime64{<:Dates.TimePeriod}}, ::Type{Date}) = DateTime 
@@ -63,6 +64,7 @@ typestr(::Type{MaxLengthString{N,UInt32}}) where N = string('<', 'U', N)
 typestr(::Type{MaxLengthString{N,UInt8}}) where N = string('<', 'S', N)
 typestr(::Type{<:Array}) = "|O"
 typestr(::Type{<:DateTime64{P}}) where P = "<M8[$(pdt64string[P])]"
+typestr(::Type{<:AbstractString}) = "|O"
 
 const typestr_regex = r"^([<|>])([tbiufcmMOSUV])(\d*)(\[\w+\])?$"
 const typemap = Dict{Tuple{Char, Int}, DataType}(
@@ -96,7 +98,7 @@ function typestr(s::AbstractString, filterlist=nothing)
             if filterlist === nothing
                 throw(ArgumentError("Object array can only be parsed when an appropriate filter is defined"))
             end
-            return Vector{sourcetype(first(filterlist))}
+            return sourcetype(first(filterlist))
         end
         isempty(typesize) && throw((ArgumentError("$s is not a valid numpy typestr")))
         tc, ts = first(typecode), parse(Int, typesize)
@@ -243,4 +245,5 @@ Base.eltype(::Metadata{T}) where T = T
 fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v)
 fill_value_decoding(v::Nothing, ::Any) = v
 fill_value_decoding(v, T) = T(v)
+fill_value_decoding(v::Number, T::Type{String}) = v == 0 ? "" : T(UInt8[v])
 fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v
diff --git a/test/python.jl b/test/python.jl
@@ -27,7 +27,8 @@ dtypes = (UInt8, UInt16, UInt32, UInt64,
     Int8, Int16, Int32, Int64,
     Float16, Float32, Float64,
     Complex{Float32}, Complex{Float64},
-    Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32})
+    Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32},
+    String)
 compressors = (
     "no"=>NoCompressor(),
     "blosc"=>BloscCompressor(cname="zstd"),
@@ -63,7 +64,7 @@ gatts = g.attrs
 dtypesp = ("uint8","uint16","uint32","uint64",
     "int8","int16","int32","int64",
     "float16","float32","float64",
-    "complex64", "complex128","bool","S10","U10")
+    "complex64", "complex128","bool","S10","U10", "O")
 
 #Test accessing arrays from python and reading data
 for i=1:length(dtypes), co in compressors
@@ -115,13 +116,16 @@ end
 data = rand(Int32,2,6,10)
 py"""
 import numcodecs
+import numpy as np
 g = zarr.group($ppython)
 g.attrs["groupatt"] = "Hi"
 z1 = g.create_dataset("a1", shape=(2,6,10),chunks=(1,2,3), dtype='i4')
 z1[:,:,:]=$data
 z1.attrs["test"]={"b": 6}
 z2 = g.create_dataset("a2", shape=(5,),chunks=(5,), dtype='S1', compressor=numcodecs.Zlib())
 z2[:]=[k for k in 'hallo']
+z3 = g.create_dataset('a3', shape=(2,), dtype=str)
+z3[:]=np.asarray(['test1', 'test234'], dtype='O')
 zarr.consolidate_metadata($ppython)
 """
 
@@ -135,6 +139,7 @@ a1 = g["a1"]
 @test a1.attrs["test"]==Dict("b"=>6)
 # Test reading the string array
 @test String(g["a2"][:])=="hallo"
+@test g["a3"] == ["test1", "test234"]
 
 # And test for consolidated metadata
 # Delete files so we make sure they are not accessed

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -218,13 +218,12 @@ end
 end
 
 @testset "string array getindex/setindex" begin
-  using Zarr: MaxLengthString
   aa = ["this", "is", "all ", "ascii"]
   bb = ["And" "Unicode"; "ματριξ" missing]
   a = ZArray(aa)
-  b = ZArray(bb, fill_value = MaxLengthString{7,UInt32}(""))
-  @test eltype(a) == MaxLengthString{5,UInt8}
-  @test eltype(b) == Union{MaxLengthString{7,UInt32},Missing}
+  b = ZArray(bb, fill_value = "")
+  @test eltype(a) == String
+  @test eltype(b) == Union{String,Missing}
   @test a[:] == ["this", "is", "all ", "ascii"]
   @test all(isequal.(b[:,:],["And" "Unicode"; "ματριξ" missing]))
 end