Skip to content

Commit

Permalink
add support for variable-length strings (#116)
Browse files Browse the repository at this point in the history
* add support for variable-length strings

* add unit test
  • Loading branch information
ilia-kats authored Jun 8, 2023
1 parent 26f7ed0 commit 6be3259
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 22 deletions.
35 changes: 34 additions & 1 deletion src/Filters.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,13 @@ Encodes and decodes variable-length arrays of arbitrary data type
"""
struct VLenArrayFilter{T} <: Filter{T,UInt8} end

"""
VLenUTF8Filter
Encodes and decodes variable-length unicode strings
"""
struct VLenUTF8Filter <: Filter{String, UInt8} end

function zdecode(ain, ::VLenArrayFilter{T}) where T
f = IOBuffer(ain)
nitems = read(f, UInt32)
Expand All @@ -51,8 +58,34 @@ function zencode(ain,::VLenArrayFilter)
take!(b)
end

"""
    zdecode(ain, ::VLenUTF8Filter)

Decode the `vlen-utf8` encoded byte buffer `ain` and return a `Vector{String}`.

The buffer is read as a `UInt32` item count followed, for every item, by a
`UInt32` byte length and that many bytes of string data. The integer headers
are interpreted as little-endian regardless of the byte order of the host.
"""
function zdecode(ain, ::VLenUTF8Filter)
    io = IOBuffer(ain)
    # `ltoh` is a no-op on little-endian hosts and byte-swaps on big-endian ones,
    # so the headers are decoded identically everywhere.
    nitems = ltoh(read(io, UInt32))
    out = Vector{String}(undef, nitems)
    for i in eachindex(out)
        nbytes = ltoh(read(io, UInt32))
        out[i] = String(read(io, nbytes))
    end
    close(io)
    return out
end

# Return `s` as a `String`. Generic `AbstractString`s (for example `SubString`) are
# converted with `String`; every other input keeps going through `transcode`.
_vlen_utf8_string(s::AbstractString) = String(s)
_vlen_utf8_string(x) = transcode(String, x)

"""
    zencode(ain, ::VLenUTF8Filter)

Encode the collection of strings `ain` into a `vlen-utf8` byte buffer and return it
as a `Vector{UInt8}`.

The output starts with the number of items as a `UInt32`, followed for every item by
its length in bytes as a `UInt32` and the bytes of the string. The integer headers
are written little-endian regardless of the byte order of the host.

Throws an `InexactError` if the item count or a string's byte length does not fit
into a `UInt32`.
"""
function zencode(ain, ::VLenUTF8Filter)
    buf = IOBuffer()
    # `htol` is a no-op on little-endian hosts and byte-swaps on big-endian ones.
    write(buf, htol(UInt32(length(ain))))
    for item in ain
        str = _vlen_utf8_string(item)
        write(buf, htol(UInt32(ncodeunits(str))))
        write(buf, str)
    end
    return take!(buf)
end

# JSON representation of a `VLenArrayFilter{T}`: the codec id plus the typestr of `T`.
function JSON.lower(::VLenArrayFilter{T}) where {T}
    return Dict("id" => "vlen-array", "dtype" => typestr(T))
end

# JSON representation of a `VLenUTF8Filter`: only the codec id is stored.
function JSON.lower(::VLenUTF8Filter)
    return Dict("id" => "vlen-utf8")
end

# Rebuild a `VLenArrayFilter` from its metadata dictionary `f`; the type parameter is
# obtained by passing the stored "dtype" entry to `typestr`.
function getfilter(::Type{<:VLenArrayFilter}, f)
    elementtype = typestr(f["dtype"])
    return VLenArrayFilter{elementtype}()
end

# `VLenUTF8Filter` carries no parameters, so the metadata `f` is not consulted.
function getfilter(::Type{<:VLenUTF8Filter}, f)
    return VLenUTF8Filter()
end

filterdict = Dict("vlen-array"=>VLenArrayFilter)
filterdict = Dict("vlen-array"=>VLenArrayFilter, "vlen-utf8"=>VLenUTF8Filter)
22 changes: 8 additions & 14 deletions src/ZArray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ const concurrent_io_tasks = Ref(50)
# A fill value given as text is parsed into the numeric element type `T`.
function getfillval(::Type{T}, t::String) where {T<:Number}
    return parse(T, t)
end

# A fill value that already is a `T` (or is `nothing`) is passed through unchanged.
function getfillval(::Type{T}, t::Union{T,Nothing}) where {T}
    return t
end

struct SenMissArray{T,N,V} <: AbstractArray{Union{T,Missing},N}
struct SenMissArray{T,N} <: AbstractArray{Union{T,Missing},N}
x::Array{T,N}
senval::T
end
SenMissArray(x::Array{T,N},v) where {T,N} = SenMissArray{T,N,convert(T,v)}(x)
SenMissArray(x::Array{T,N},v) where {T,N} = SenMissArray{T,N}(x,convert(T,v))
Base.size(x::SenMissArray) = size(x.x)
senval(x::SenMissArray{<:Any,<:Any,V}) where V = V
senval(x::SenMissArray) = x.senval
function Base.getindex(x::SenMissArray,i::Int)
v = x.x[i]
isequal(v,senval(x)) ? missing : v
Expand Down Expand Up @@ -78,6 +79,7 @@ storageratio(z::ZArray{<:Vector}) = "unknown"

# Number of bytes taken by the elements: element count times `sizeof` the element type.
function nobytes(z::ZArray)
    nelements = length(z)
    return nelements * sizeof(eltype(z))
end

# For `Vector` and `String` element types no byte count is computed.
nobytes(::ZArray{<:Vector}) = "unknown"
nobytes(::ZArray{<:String}) = "unknown"

zinfo(z::ZArray) = zinfo(stdout,z)
function zinfo(io::IO,z::ZArray)
Expand Down Expand Up @@ -361,20 +363,12 @@ function filterfromtype(::Type{<:AbstractArray{T}}) where T
(VLenArrayFilter{T}(),)
end

# String element types, optionally combined with `Missing`, get a `VLenUTF8Filter`
# (returned as a one-element tuple).
filterfromtype(::Type{<:Union{<:AbstractString, Union{<:AbstractString, Missing}}}) = (VLenUTF8Filter(),)
# `MaxLengthString` element types, optionally combined with `Missing`, get no filter.
# NOTE(review): presumably because they are stored with a fixed width — confirm.
filterfromtype(::Type{<:Union{MaxLengthString, Union{MaxLengthString, Missing}}}) = nothing

#Not all Array types can be mapped directly to a valid ZArray encoding.
#Here we try to determine the correct element type
# Default: the element type of the array is used unchanged.
function to_zarrtype(::AbstractArray{T}) where {T}
    return T
end
# String arrays map to a `MaxLengthString` whose length parameter is the largest
# `length` found among the non-missing entries and whose code unit type is `UInt8`
# when every entry is ASCII and `UInt32` otherwise. `Missing` is kept in the result
# type when the element type of `a` allows it.
function to_zarrtype(a::AbstractArray{<:Union{AbstractString,Missing}})
    allascii = true
    longest = 0
    for s in a
        ismissing(s) && continue
        allascii &= isascii(s)
        longest = max(longest, length(s))
    end
    codeunit = allascii ? UInt8 : UInt32
    strtype = MaxLengthString{longest,codeunit}
    if Missing <: eltype(a)
        return Union{strtype,Missing}
    end
    return strtype
end
# Arrays of `Date` map to a `DateTime64` with `Dates.Day` as its period parameter.
function to_zarrtype(::AbstractArray{<:Date})
    return DateTime64{Dates.Day}
end

# Arrays of `DateTime` map to a `DateTime64` with `Dates.Millisecond` as its period.
function to_zarrtype(::AbstractArray{<:DateTime})
    return DateTime64{Dates.Millisecond}
end

Expand Down
5 changes: 4 additions & 1 deletion src/metadata.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ Base.convert(::Type{DateTime64{P}}, t::Date) where P = DateTime64{P}(Dates.value
# DateTime -> DateTime64{P}: the time elapsed since DateTime(1970) is converted to the
# period type P and its integer value is stored.
Base.convert(::Type{DateTime64{P}}, t::DateTime) where P = DateTime64{P}(Dates.value(P(t-DateTime(1970))))
# DateTime64{Q} -> DateTime64{P}: the stored count `t.i` is wrapped in period Q,
# converted to period P, and the resulting value is stored.
Base.convert(::Type{DateTime64{P}}, t::DateTime64{Q}) where {P,Q} = DateTime64{P}(Dates.value(P(Q(t.i))))
# `zero` for a DateTime64 value or type is obtained by calling `t(0)`.
Base.zero(t::Union{DateTime64, Type{<:DateTime64}}) = t(0)
# `zero` for a String value or the String type is the empty string.
# NOTE(review): this adds a method to a Base function for a Base type (type piracy);
# confirm no other code relies on `zero(String)` being undefined.
Base.zero(t::Union{String, Type{String}}) = ""
# Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{Date}) = Date
# Base.promote_rule(::Type{<:DateTime64{<:Dates.DatePeriod}}, ::Type{DateTime}) = DateTime
# Base.promote_rule(::Type{<:DateTime64{<:Dates.TimePeriod}}, ::Type{Date}) = DateTime
Expand All @@ -63,6 +64,7 @@ typestr(::Type{MaxLengthString{N,UInt32}}) where N = string('<', 'U', N)
# MaxLengthString with UInt8 code units and length parameter N: "<S" followed by N.
typestr(::Type{MaxLengthString{N,UInt8}}) where N = string('<', 'S', N)
# Any Array element type is written as the "|O" typestr.
typestr(::Type{<:Array}) = "|O"
# DateTime64 with period P: "<M8[...]" with the unit text looked up in `pdt64string`.
typestr(::Type{<:DateTime64{P}}) where P = "<M8[$(pdt64string[P])]"
# Any AbstractString element type is written as "|O", the same typestr as Array.
typestr(::Type{<:AbstractString}) = "|O"

const typestr_regex = r"^([<|>])([tbiufcmMOSUV])(\d*)(\[\w+\])?$"
const typemap = Dict{Tuple{Char, Int}, DataType}(
Expand Down Expand Up @@ -96,7 +98,7 @@ function typestr(s::AbstractString, filterlist=nothing)
if filterlist === nothing
throw(ArgumentError("Object array can only be parsed when an appropriate filter is defined"))
end
return Vector{sourcetype(first(filterlist))}
return sourcetype(first(filterlist))
end
isempty(typesize) && throw((ArgumentError("$s is not a valid numpy typestr")))
tc, ts = first(typecode), parse(Int, typesize)
Expand Down Expand Up @@ -243,4 +245,5 @@ Base.eltype(::Metadata{T}) where T = T
# A numeric fill value stored as text is parsed into the requested number type.
function fill_value_decoding(v::AbstractString, T::Type{<:Number})
    return parse(T, v)
end

# `nothing` is returned as is, whatever the target type.
fill_value_decoding(::Nothing, ::Any) = nothing

# Fallback: construct the target type directly from the stored value.
function fill_value_decoding(v, T)
    return T(v)
end

# Numeric fill value for a String target: zero gives the empty string, any other
# number is converted to a single `UInt8` that forms the content of the string.
function fill_value_decoding(v::Number, T::Type{String})
    if v == 0
        return ""
    end
    return T(UInt8[v])
end
# For ASCIIChar targets an empty-string fill value decodes to `nothing`; any other
# value is returned unchanged.
# NOTE(review): a call with `nothing` and `ASCIIChar` matches both this method and
# the `(::Nothing, ::Any)` method, neither being more specific — check whether that
# combination can occur, since it would raise an ambiguity error.
fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v
9 changes: 7 additions & 2 deletions test/python.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ dtypes = (UInt8, UInt16, UInt32, UInt64,
Int8, Int16, Int32, Int64,
Float16, Float32, Float64,
Complex{Float32}, Complex{Float64},
Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32})
Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32},
String)
compressors = (
"no"=>NoCompressor(),
"blosc"=>BloscCompressor(cname="zstd"),
Expand Down Expand Up @@ -63,7 +64,7 @@ gatts = g.attrs
dtypesp = ("uint8","uint16","uint32","uint64",
"int8","int16","int32","int64",
"float16","float32","float64",
"complex64", "complex128","bool","S10","U10")
"complex64", "complex128","bool","S10","U10", "O")

#Test accessing arrays from python and reading data
for i=1:length(dtypes), co in compressors
Expand Down Expand Up @@ -115,13 +116,16 @@ end
data = rand(Int32,2,6,10)
py"""
import numcodecs
import numpy as np
g = zarr.group($ppython)
g.attrs["groupatt"] = "Hi"
z1 = g.create_dataset("a1", shape=(2,6,10),chunks=(1,2,3), dtype='i4')
z1[:,:,:]=$data
z1.attrs["test"]={"b": 6}
z2 = g.create_dataset("a2", shape=(5,),chunks=(5,), dtype='S1', compressor=numcodecs.Zlib())
z2[:]=[k for k in 'hallo']
z3 = g.create_dataset('a3', shape=(2,), dtype=str)
z3[:]=np.asarray(['test1', 'test234'], dtype='O')
zarr.consolidate_metadata($ppython)
"""

Expand All @@ -135,6 +139,7 @@ a1 = g["a1"]
@test a1.attrs["test"]==Dict("b"=>6)
# Test reading the string array
@test String(g["a2"][:])=="hallo"
@test g["a3"] == ["test1", "test234"]

# And test for consolidated metadata
# Delete files so we make sure they are not accessed
Expand Down
7 changes: 3 additions & 4 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -218,13 +218,12 @@ end
end

@testset "string array getindex/setindex" begin
using Zarr: MaxLengthString
aa = ["this", "is", "all ", "ascii"]
bb = ["And" "Unicode"; "ματριξ" missing]
a = ZArray(aa)
b = ZArray(bb, fill_value = MaxLengthString{7,UInt32}(""))
@test eltype(a) == MaxLengthString{5,UInt8}
@test eltype(b) == Union{MaxLengthString{7,UInt32},Missing}
b = ZArray(bb, fill_value = "")
@test eltype(a) == String
@test eltype(b) == Union{String,Missing}
@test a[:] == ["this", "is", "all ", "ascii"]
@test all(isequal.(b[:,:],["And" "Unicode"; "ματριξ" missing]))
end
Expand Down

2 comments on commit 6be3259

@meggart
Copy link
Collaborator

@meggart meggart commented on 6be3259 Jun 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/85113

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.9.0 -m "<description of version>" 6be3259e8e5982bd41f0ce4e7e01eca832f13a87
git push origin v0.9.0

Please sign in to comment.