diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl
new file mode 100644
index 0000000..829f31f
--- /dev/null
+++ b/src/Filters/Filters.jl
@@ -0,0 +1,95 @@
+import JSON
+
+"""
+    abstract type Filter{T,TENC}
+
+The supertype for all Zarr filters.
+
+## Interface
+
+All subtypes MUST implement the following methods:
+
+- [`zencode(ain, filter::Filter)`](@ref zencode): Encodes data `ain` using the filter, and returns a vector of bytes.
+- [`zdecode(ain, filter::Filter)`](@ref zdecode): Decodes data `ain`, a vector of bytes, using the filter, and returns the original data.
+- [`JSON.lower`](@ref): Returns a JSON-serializable dictionary representing the filter, according to the Zarr specification.
+- [`getfilter(::Type{<: Filter}, filterdict)`](@ref getfilter): Constructs a filter instance from a given filter dictionary.
+
+If the generic fallbacks (which simply return the type parameters `T` and `TENC`) do not apply, a filter MUST also implement:
+- [`sourcetype(::Filter)::T`](@ref sourcetype): equivalent to `dtype` in the Python Zarr implementation.
+- [`desttype(::Filter)::T`](@ref desttype): equivalent to `astype` in the Python Zarr implementation.
+
+Finally, an entry MUST be added to the `filterdict` dictionary for each filter type.
+The key is the filter's name as given in the Zarr specification, and the value is the
+filter type (e.g. `VLenUTF8Filter` or `Fletcher32Filter`).
+
+Subtypes include: [`VLenArrayFilter`](@ref), [`VLenUTF8Filter`](@ref), [`Fletcher32Filter`](@ref).
+"""
+abstract type Filter{T,TENC} end
+
+"""
+    zencode(ain, filter::Filter)
+
+Encodes data `ain` using the filter, and returns a vector of bytes.
+"""
+function zencode end
+
+"""
+    zdecode(ain, filter::Filter)
+
+Decodes data `ain`, a vector of bytes, using the filter, and returns the original data.
+"""
+function zdecode end
+
+"""
+    getfilter(::Type{<: Filter}, filterdict)
+
+Constructs a filter instance from a given specification dictionary, which must follow the Zarr specification.
+"""
+function getfilter end
+
+"""
+    sourcetype(::Filter)::T
+
+Returns the source (decoded) type of the filter.
+"""
+function sourcetype end
+
+"""
+    desttype(::Filter)::T
+
+Returns the destination (encoded) type of the filter.
+"""
+function desttype end
+
+const filterdict = Dict{String,Type{<:Filter}}()
+
+function getfilters(d::Dict)
+    if !haskey(d,"filters")
+        return nothing
+    else
+        if d["filters"] === nothing || isempty(d["filters"])
+            return nothing
+        end
+        f = map(d["filters"]) do f
+            try
+                getfilter(filterdict[f["id"]], f)
+            catch e
+                @show f
+                rethrow(e)
+            end
+        end
+        return (f...,)
+    end
+end
+sourcetype(::Filter{T}) where T = T
+desttype(::Filter{<:Any,T}) where T = T
+
+zencode(ain,::Nothing) = ain
+
+include("vlenfilters.jl")
+include("fletcher32.jl")
+include("fixedscaleoffset.jl")
+include("shuffle.jl")
+include("quantize.jl")
+include("delta.jl")
diff --git a/src/Filters/delta.jl b/src/Filters/delta.jl
new file mode 100644
index 0000000..9d1de04
--- /dev/null
+++ b/src/Filters/delta.jl
@@ -0,0 +1,45 @@
+#=
+# Delta compression
+
+This file implements delta encoding for Zarr arrays: only the first element and the differences between consecutive elements are stored, which compresses well when values vary smoothly.
+=#
+
+"""
+    DeltaFilter(; DecodingType, [EncodingType = DecodingType])
+
+Delta-based compression for Zarr arrays. Encoding stores the first element followed by the
+differences between consecutive elements (as in Julia's `diff`); decoding reconstructs the
+original data via a cumulative sum (as in Julia's `cumsum`).
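+
+## Example
+
+A sketch of the round trip implied by the definitions below (illustrative, not a doctest):
+
+```julia
+filt = DeltaFilter{Int32}()
+zencode(Int32[1, 2, 4, 7], filt) # returns Int32[1, 1, 2, 3]
+zdecode(Int32[1, 1, 2, 3], filt) # returns Int32[1, 2, 4, 7]
+```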
+""" +struct DeltaFilter{T, TENC} <: Filter{T, TENC} +end + +function DeltaFilter(; DecodingType = Float16, EncodingType = DecodingType) + return DeltaFilter{DecodingType, EncodingType}() +end + +DeltaFilter{T}() where T = DeltaFilter{T, T}() + +function zencode(data::AbstractArray, filter::DeltaFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} + arr = reinterpret(DecodingType, vec(data)) + + enc = similar(arr, EncodingType) + # perform the delta operation + enc[begin] = arr[begin] + enc[begin+1:end] .= diff(arr) + return enc +end + +function zdecode(data::AbstractArray, filter::DeltaFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} + encoded = reinterpret(EncodingType, vec(data)) + decoded = DecodingType.(cumsum(encoded)) + return decoded +end + +function JSON.lower(filter::DeltaFilter{T, Tenc}) where {T, Tenc} + return Dict("id" => "delta", "dtype" => typestr(T), "astype" => typestr(Tenc)) +end + +function getfilter(::Type{<: DeltaFilter}, d) + return DeltaFilter{typestr(d["dtype"], haskey(d, "astype") ? typestr(d["astype"]) : d["dtype"])}() +end + +filterdict["delta"] = DeltaFilter \ No newline at end of file diff --git a/src/Filters/fixedscaleoffset.jl b/src/Filters/fixedscaleoffset.jl new file mode 100644 index 0000000..9e12c52 --- /dev/null +++ b/src/Filters/fixedscaleoffset.jl @@ -0,0 +1,52 @@ + +""" + FixedScaleOffsetFilter{T,TENC}(scale, offset) + +A compressor that scales and offsets the data. + +!!! note + The geographic CF standards define scale/offset decoding as `x * scale + offset`, + but this filter defines it as `x / scale + offset`. Constructing a `FixedScaleOffsetFilter` + from CF data means `FixedScaleOffsetFilter(1/cf_scale_factor, cf_add_offset)`. +""" +struct FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc} <: Filter{T, Tenc} + scale::ScaleOffsetType + offset::ScaleOffsetType +end + +FixedScaleOffsetFilter{T}(scale::ScaleOffsetType, offset::ScaleOffsetType) where {T, ScaleOffsetType} = FixedScaleOffsetFilter{T, ScaleOffsetType}(scale, offset) +FixedScaleOffsetFilter(scale::ScaleOffsetType, offset::ScaleOffsetType) where {ScaleOffsetType} = FixedScaleOffsetFilter{ScaleOffsetType, ScaleOffsetType}(scale, offset) + +function FixedScaleOffsetFilter(; scale::ScaleOffsetType, offset::ScaleOffsetType, T, Tenc = T) where ScaleOffsetType + return FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}(scale, offset) +end + +function zencode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} + if Tenc <: Integer + return [round(Tenc, (a - c.offset) * c.scale) for a in a] # apply scale and offset, and round to nearest integer + else + return [convert(Tenc, (a - c.offset) * c.scale) for a in a] # apply scale and offset + end +end + +function zdecode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} + return [convert(Base.nonmissingtype(T), (a / c.scale) + c.offset) for a in a] +end + + +function getfilter(::Type{<: FixedScaleOffsetFilter}, d::Dict) + scale = d["scale"] + offset = d["offset"] + # Types must be converted from strings to the actual Julia types they represent. 
+
+function getfilter(::Type{<: FixedScaleOffsetFilter}, d::Dict)
+    scale = d["scale"]
+    offset = d["offset"]
+    # Types must be converted from strings to the actual Julia types they represent.
+    string_T = d["dtype"]
+    string_Tenc = get(d, "astype", string_T)
+    T = typestr(string_T)
+    Tenc = typestr(string_Tenc)
+    return FixedScaleOffsetFilter{Tenc, T, Tenc}(scale, offset)
+end
+
+function JSON.lower(c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {ScaleOffsetType, T, Tenc}
+    return Dict("id" => "fixedscaleoffset", "scale" => c.scale, "offset" => c.offset, "dtype" => typestr(T), "astype" => typestr(Tenc))
+end
+
+filterdict["fixedscaleoffset"] = FixedScaleOffsetFilter
diff --git a/src/Filters/fletcher32.jl b/src/Filters/fletcher32.jl
new file mode 100644
index 0000000..d854cb9
--- /dev/null
+++ b/src/Filters/fletcher32.jl
@@ -0,0 +1,85 @@
+#=
+# Fletcher32 filter
+
+This "filter" simply appends a 4-byte Fletcher32 checksum to the data, to ensure data integrity.
+
+The implementation is based on the [numcodecs implementation here](https://github.com/zarr-developers/numcodecs/blob/79d1a8d4f9c89d3513836aba0758e0d2a2a1cfaf/numcodecs/fletcher32.pyx)
+and the [original C implementation for NetCDF](https://github.com/Unidata/netcdf-c/blob/main/plugins/H5checksum.c#L109) linked therein.
+=#
+
+"""
+    Fletcher32Filter()
+
+A filter that appends a 4-byte Fletcher32 checksum to the data during encoding,
+and verifies and strips that checksum during decoding.
+
+Note that this goes from UInt8 to UInt8: decoding only verifies
+the checksum and crops the last 4 bytes of the data.
+"""
+struct Fletcher32Filter <: Filter{UInt8, UInt8}
+end
+
+getfilter(::Type{<: Fletcher32Filter}, d::Dict) = Fletcher32Filter()
+JSON.lower(::Fletcher32Filter) = Dict("id" => "fletcher32")
+filterdict["fletcher32"] = Fletcher32Filter
+
+function _checksum_fletcher32(data::AbstractArray{UInt8})
+    len = length(data) ÷ 2 # length in 16-bit words
+    sum1::UInt32 = 0
+    sum2::UInt32 = 0
+    data_idx = 1
+
+    #=
+    Compute the checksum for pairs of bytes.
+    The magic `360` value is the largest number of sums that can be performed without overflow in UInt32.
+    =#
+    while len > 0
+        tlen = len > 360 ? 360 : len
+        len -= tlen
+        while tlen > 0
+            sum1 += begin # create a 16-bit word from two bytes, the first one shifted into the high byte of the word
+                (UInt16(data[data_idx]) << 8) | UInt16(data[data_idx + 1])
+            end
+            sum2 += sum1
+            data_idx += 2
+            tlen -= 1
+        end
+        sum1 = (sum1 & 0xffff) + (sum1 >> 16)
+        sum2 = (sum2 & 0xffff) + (sum2 >> 16)
+    end
+
+    # if the length of the data is odd, treat the remaining byte as the high byte
+    # of a final 16-bit word, as the C implementation does
+    if length(data) % 2 == 1
+        sum1 += UInt16(data[data_idx]) << 8
+        sum2 += sum1
+        sum1 = (sum1 & 0xffff) + (sum1 >> 16)
+        sum2 = (sum2 & 0xffff) + (sum2 >> 16)
+    end
+    return (sum2 << 16) | sum1
+end
+
+function zencode(data, ::Fletcher32Filter)
+    bytes = reinterpret(UInt8, vec(data))
+    checksum = _checksum_fletcher32(bytes)
+    result = copy(bytes)
+    append!(result, reinterpret(UInt8, [checksum])) # TODO: decompose this without the extra allocation of wrapping in Array
+    return result
+end
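+
+# Round-trip sketch (property-based, without assuming any particular checksum value):
+#
+#     data = rand(UInt8, 10)
+#     enc = zencode(data, Fletcher32Filter())
+#     length(enc) == length(data) + 4 # the last 4 bytes are the little-endian UInt32 checksum
+#     zdecode(enc, Fletcher32Filter()) == data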
+ """)) # TODO: make this a custom error type + end + return view(bytes, 1:length(bytes) - 4) +end diff --git a/src/Filters/quantize.jl b/src/Filters/quantize.jl new file mode 100644 index 0000000..c5d7c9a --- /dev/null +++ b/src/Filters/quantize.jl @@ -0,0 +1,52 @@ +#= +# Quantize compression + + +=# + +""" + QuantizeFilter(; digits, DecodingType, [EncodingType = DecodingType]) + +Quantization based compression for Zarr arrays. +""" +struct QuantizeFilter{T, TENC} <: Filter{T, TENC} + digits::Int32 +end + +function QuantizeFilter(; digits = 10, T = Float16, Tenc = T) + return QuantizeFilter{T, Tenc}(digits) +end + +QuantizeFilter{T, Tenc}(; digits = 10) where {T, Tenc} = QuantizeFilter{T, Tenc}(digits) +QuantizeFilter{T}(; digits = 10) where T = QuantizeFilter{T, T}(digits) + +function zencode(data::AbstractArray, filter::QuantizeFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} + arr = reinterpret(DecodingType, vec(data)) + + precision = 10.0^(-filter.digits) + + _exponent = log(10, precision) # log 10 in base `precision` + exponent = _exponent < 0 ? floor(Int, _exponent) : ceil(Int, _exponent) + + bits = ceil(log(2, 10.0^(-exponent))) + scale = 2.0^bits + + enc = @. convert(EncodingType, round(scale * arr) / scale) + + return enc +end + +# Decoding is a no-op; quantization is a lossy filter but data is encoded directly. +function zdecode(data::AbstractArray, filter::QuantizeFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} + return data +end + +function JSON.lower(filter::QuantizeFilter{T, Tenc}) where {T, Tenc} + return Dict("id" => "quantize", "digits" => filter.digits, "dtype" => typestr(T), "astype" => typestr(Tenc)) +end + +function getfilter(::Type{<: QuantizeFilter}, d) + return QuantizeFilter{typestr(d["dtype"], typestr(d["astype"]))}(; digits = d["digits"]) +end + +filterdict["quantize"] = QuantizeFilter \ No newline at end of file diff --git a/src/Filters/shuffle.jl b/src/Filters/shuffle.jl new file mode 100644 index 0000000..6a01f5d --- /dev/null +++ b/src/Filters/shuffle.jl @@ -0,0 +1,70 @@ +#= +# Shuffle compression + +This file implements the shuffle compressor. 
+
+struct ShuffleFilter <: Filter{UInt8, UInt8}
+    elementsize::Csize_t
+end
+
+ShuffleFilter(; elementsize = 4) = ShuffleFilter(elementsize)
+
+function _do_shuffle!(dest::AbstractVector{UInt8}, source::AbstractVector{UInt8}, elementsize::Csize_t)
+    count = fld(length(source), elementsize) # elementsize is in bytes, so this works
+    for i in 0:(count-1)
+        offset = i * elementsize
+        for byte_index in 0:(elementsize-1)
+            j = byte_index * count + i
+            dest[j+1] = source[offset + byte_index+1]
+        end
+    end
+end
+
+function _do_unshuffle!(dest::AbstractVector{UInt8}, source::AbstractVector{UInt8}, elementsize::Csize_t)
+    count = fld(length(source), elementsize) # elementsize is in bytes, so this works
+    for i in 0:(elementsize-1)
+        offset = i * count
+        for byte_index in 0:(count-1)
+            j = byte_index * elementsize + i
+            dest[j+1] = source[offset + byte_index+1]
+        end
+    end
+end
+
+function zencode(a::AbstractArray, c::ShuffleFilter)
+    if c.elementsize <= 1 # no shuffling needed if elementsize is 1
+        return a
+    end
+    source = reinterpret(UInt8, vec(a))
+    dest = Vector{UInt8}(undef, length(source))
+    _do_shuffle!(dest, source, c.elementsize)
+    return dest
+end
+
+function zdecode(a::AbstractArray, c::ShuffleFilter)
+    if c.elementsize <= 1 # no shuffling needed if elementsize is 1
+        return a
+    end
+    source = reinterpret(UInt8, vec(a))
+    dest = Vector{UInt8}(undef, length(source))
+    _do_unshuffle!(dest, source, c.elementsize)
+    return dest
+end
+
+function getfilter(::Type{ShuffleFilter}, d::Dict)
+    return ShuffleFilter(d["elementsize"])
+end
+
+function JSON.lower(c::ShuffleFilter)
+    return Dict("id" => "shuffle", "elementsize" => Int64(c.elementsize))
+end
+
+filterdict["shuffle"] = ShuffleFilter
\ No newline at end of file
diff --git a/src/Filters.jl b/src/Filters/vlenfilters.jl
similarity index 72%
rename from src/Filters.jl
rename to src/Filters/vlenfilters.jl
index fde3db9..dad91df 100644
--- a/src/Filters.jl
+++ b/src/Filters/vlenfilters.jl
@@ -1,38 +1,24 @@
-import JSON
+#=
+# Variable-length filters
+
+This file implements variable-length filters for Zarr, i.e., filters that write arrays of variable-length arrays ("ragged arrays").
 
-abstract type Filter{T,TENC} end
-function getfilters(d::Dict)
-    if !haskey(d,"filters")
-        return nothing
-    else
-        if d["filters"] === nothing || isempty(d["filters"])
-            return nothing
-        end
-        f = map(d["filters"]) do f
-            getfilter(filterdict[f["id"]], f)
-        end
-        return (f...,)
-    end
-end
-sourcetype(::Filter{T}) where T = T
-desttype(::Filter{<:Any,T}) where T = T
+Specifically, it implements the `VLenArrayFilter` and `VLenUTF8Filter` types, which are used to encode and decode variable-length arrays and UTF-8 strings, respectively.
+=#
 
-zencode(ain,::Nothing) = ain
+# ## VLenArrayFilter
 
 """
     VLenArrayFilter(T)
 
-Encodes and decodes variable-length arrays of arbitrary data type
+Encodes and decodes variable-length arrays of arbitrary data type `T`.
 """
 struct VLenArrayFilter{T} <: Filter{T,UInt8} end
+
+# We don't need to define `sourcetype` and `desttype` for this filter, since the generic implementations are sufficient.
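+
+# The encoded byte layout (see `zencode` below) is a `UInt32` item count followed by each
+# item as a `UInt32` byte length plus the raw bytes. For instance (illustrative, little-endian):
+#
+#     zencode([[0x01], [0x02, 0x03]], VLenArrayFilter{UInt8}())
+#     # 02 00 00 00 | 01 00 00 00 | 01 | 02 00 00 00 | 02 03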
-""" - VLenUTF8Filter - -Encodes and decodes variable-length unicode strings -""" -struct VLenUTF8Filter <: Filter{String, UInt8} end +JSON.lower(::VLenArrayFilter{T}) where T = Dict("id"=>"vlen-array","dtype"=> typestr(T) ) +getfilter(::Type{<:VLenArrayFilter}, f) = VLenArrayFilter{typestr(f["dtype"])}() +filterdict["vlen-array"] = VLenArrayFilter function zdecode(ain, ::VLenArrayFilter{T}) where T f = IOBuffer(ain) @@ -46,7 +32,7 @@ function zdecode(ain, ::VLenArrayFilter{T}) where T out end -#Encodes Array of Vectors a into bytes +#Encodes Array of Vectors `ain` into bytes function zencode(ain,::VLenArrayFilter) b = IOBuffer() nitems = length(ain) @@ -58,6 +44,19 @@ function zencode(ain,::VLenArrayFilter) take!(b) end +# ## VLenUTF8Filter + +""" + VLenUTF8Filter + +Encodes and decodes variable-length unicode strings +""" +struct VLenUTF8Filter <: Filter{String, UInt8} end + +JSON.lower(::VLenUTF8Filter) = Dict("id"=>"vlen-utf8") +getfilter(::Type{<:VLenUTF8Filter}, f) = VLenUTF8Filter() +filterdict["vlen-utf8"] = VLenUTF8Filter + function zdecode(ain, ::VLenUTF8Filter) f = IOBuffer(ain) nitems = read(f, UInt32) @@ -81,11 +80,3 @@ function zencode(ain, ::VLenUTF8Filter) end take!(b) end - -JSON.lower(::VLenArrayFilter{T}) where T = Dict("id"=>"vlen-array","dtype"=> typestr(T) ) -JSON.lower(::VLenUTF8Filter) = Dict("id"=>"vlen-utf8") - -getfilter(::Type{<:VLenArrayFilter}, f) = VLenArrayFilter{typestr(f["dtype"])}() -getfilter(::Type{<:VLenUTF8Filter}, f) = VLenUTF8Filter() - -filterdict = Dict("vlen-array"=>VLenArrayFilter, "vlen-utf8"=>VLenUTF8Filter) diff --git a/src/Zarr.jl b/src/Zarr.jl index ecc221a..dbdeb9a 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -6,7 +6,7 @@ import Blosc include("metadata.jl") include("Compressors/Compressors.jl") include("Storage/Storage.jl") -include("Filters.jl") +include("Filters/Filters.jl") include("ZArray.jl") include("ZGroup.jl") diff --git a/src/metadata.jl b/src/metadata.jl index f3dc5df..b677cc0 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -156,6 +156,12 @@ function Metadata(d::AbstractDict, fill_as_missing) # create a Metadata struct from it compdict = d["compressor"] + if isnothing(compdict) + # try the last filter, for Kerchunk compat + if !isnothing(d["filters"]) && haskey(compressortypes, d["filters"][end]["id"]) + compdict = pop!(d["filters"]) # TODO: this will not work with JSON3! + end + end compressor = getCompressor(compdict) filters = getfilters(d) @@ -216,5 +222,6 @@ Base.eltype(::Metadata{T}) where T = T fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v) fill_value_decoding(v::Nothing, ::Any) = v fill_value_decoding(v, T) = T(v) +fill_value_decoding(v::Integer, T::Type{<: Unsigned}) = reinterpret(T, signed(T)(v)) fill_value_decoding(v::Number, T::Type{String}) = v == 0 ? "" : T(UInt8[v]) fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v diff --git a/test/Filters.jl b/test/Filters.jl new file mode 100644 index 0000000..f46cf4a --- /dev/null +++ b/test/Filters.jl @@ -0,0 +1,153 @@ +using Test +using Zarr: DateTime64 # for datetime reinterpret + +using Zarr: zencode, zdecode +using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFilter, DeltaFilter + +@testset "Fletcher32Filter" begin + # These tests are copied exactly from the [`numcodecs`](https://github.com/zarr-developers/numcodecs/) Python package, + # specifically [this file](https://github.com/zarr-developers/numcodecs/blob/main/numcodecs/tests/test_fletcher32.py). 
+    if isnothing(compdict)
+        # try the last filter, for Kerchunk compat
+        if !isnothing(d["filters"]) && haskey(compressortypes, d["filters"][end]["id"])
+            compdict = pop!(d["filters"]) # TODO: this will not work with JSON3!
+        end
+    end
     compressor = getCompressor(compdict)
 
     filters = getfilters(d)
@@ -216,5 +222,6 @@ Base.eltype(::Metadata{T}) where T = T
 fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v)
 fill_value_decoding(v::Nothing, ::Any) = v
 fill_value_decoding(v, T) = T(v)
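+# For example, a Python writer may store the fill value of a `UInt8` array as `-1`;
+# `reinterpret(UInt8, Int8(-1)) == 0xff` recovers the intended value, which is what
+# the method below implements.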
+fill_value_decoding(v::Integer, T::Type{<: Unsigned}) = reinterpret(T, signed(T)(v))
 fill_value_decoding(v::Number, T::Type{String}) = v == 0 ? "" : T(UInt8[v])
 fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v
diff --git a/test/Filters.jl b/test/Filters.jl
new file mode 100644
index 0000000..f46cf4a
--- /dev/null
+++ b/test/Filters.jl
@@ -0,0 +1,153 @@
+using Test
+using Dates # needed for the Nanosecond/Minute constructors below
+using Zarr: DateTime64 # for datetime reinterpret
+
+using Zarr: zencode, zdecode
+using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFilter, DeltaFilter
+
+@testset "Fletcher32Filter" begin
+    # These tests are copied exactly from the [`numcodecs`](https://github.com/zarr-developers/numcodecs/) Python package,
+    # specifically [this file](https://github.com/zarr-developers/numcodecs/blob/main/numcodecs/tests/test_fletcher32.py).
+
+    bit_data = vcat(
+        b"w\x07\x00\x00\x00\x00\x00\x00\x85\xf6\xff\xff\xff\xff\xff\xff",
+        b"i\x07\x00\x00\x00\x00\x00\x00\x94\xf6\xff\xff\xff\xff\xff\xff",
+        b"\x88\t\x00\x00\x00\x00\x00\x00i\x03\x00\x00\x00\x00\x00\x00",
+        b"\x93\xfd\xff\xff\xff\xff\xff\xff\xc3\xfc\xff\xff\xff\xff\xff\xff",
+        b"'\x02\x00\x00\x00\x00\x00\x00\xba\xf7\xff\xff\xff\xff\xff\xff",
+        b"\xfd%\x86d",
+    )
+    expected = [1911, -2427, 1897, -2412, 2440, 873, -621, -829, 551, -2118]
+    @test reinterpret(Int64, zdecode(bit_data, Fletcher32Filter())) == expected
+    @test zencode(expected, Fletcher32Filter()) == bit_data
+
+    for Typ in (UInt8, Int32, Float32, Float64)
+        arr = rand(Typ, 100)
+        @test reinterpret(Typ, zdecode(zencode(arr, Fletcher32Filter()), Fletcher32Filter())) == arr
+    end
+
+    data = rand(100)
+    enc = zencode(data, Fletcher32Filter())
+    enc[begin] += 1
+    @test_throws "Checksum mismatch in Fletcher32 decoding" zdecode(enc, Fletcher32Filter())
+end
+
+@testset "FixedScaleOffsetFilter" begin
+    arrays = [
+        LinRange{Float64}(1000, 1001, 1000),
+        randn(1000) .+ 1000,
+        reshape(LinRange{Float64}(1000, 1001, 1000), (100, 10)),
+        reshape(LinRange{Float64}(1000, 1001, 1000), (10, 10, 10)),
+    ]
+
+    codecs = [
+        FixedScaleOffsetFilter(offset = 1000, scale = 1, T = Float64, Tenc = Int8),
+        FixedScaleOffsetFilter(offset = 1000, scale = 10^2, T = Float64, Tenc = Int16),
+        FixedScaleOffsetFilter(offset = 1000, scale = 10^6, T = Float64, Tenc = Int32),
+        FixedScaleOffsetFilter(offset = 1000, scale = 10^12, T = Float64, Tenc = Int64),
+        FixedScaleOffsetFilter(offset = 1000, scale = 10^12, T = Float64),
+    ]
+
+    for array in arrays
+        for codec in codecs
+            encoded = Zarr.zencode(array, codec)
+            decoded = Zarr.zdecode(encoded, codec)
+            decimal = round(log10(codec.scale))
+            @test decoded ≈ array rtol=1.5*10^(-decimal)
+        end
+    end
+end
+
+@testset "ShuffleFilter" begin
+
+    codecs = [
+        ShuffleFilter(),
+        ShuffleFilter(elementsize=0),
+        ShuffleFilter(elementsize=4),
+        ShuffleFilter(elementsize=8),
+    ]
+
+    arrays = [
+        Int32.(collect(1:1000)), # equivalent to np.arange(1000, dtype='i4')
+        LinRange(1000, 1001, 1000), # equivalent to np.linspace(1000, 1001, 1000, dtype='f8')
+        reshape(randn(1000) .* 1 .+ 1000, (100, 10)), # equivalent to np.random.normal(loc=1000, scale=1, size=(100, 10))
+        reshape(rand(Bool, 1000), (10, 100)), # equivalent to np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10, order='F')
+        reshape(rand(Zarr.MaxLengthString{3, UInt8}["a", "bb", "ccc"], 1000), (10, 10, 10)), # equivalent to np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10)
+        reinterpret(DateTime64{Dates.Nanosecond}, rand(UInt64(0):UInt64(2^60)-1, 1000)), # equivalent to np.random.randint(0, 2**60, size=1000, dtype='u8').view('M8[ns]')
+        Nanosecond.(rand(UInt64(0):UInt64(2^60-1), 1000)), # equivalent to np.random.randint(0, 2**60, size=1000, dtype='u8').view('m8[ns]')
+        reinterpret(DateTime64{Dates.Minute}, rand(UInt64(0):UInt64(2^25-1), 1000)), # equivalent to np.random.randint(0, 2**25, size=1000, dtype='u8').view('M8[m]')
+        Minute.(rand(UInt64(0):UInt64(2^25-1), 1000)), # equivalent to np.random.randint(0, 2**25, size=1000, dtype='u8').view('m8[m]')
+        reinterpret(DateTime64{Dates.Nanosecond}, rand(Int64(-(2^63)):Int64(-(2^63)+20), 1000)), # equivalent to np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('M8[ns]')
+        Nanosecond.(rand(Int64(-(2^63)):Int64(-(2^63)+20), 1000)), # equivalent to np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('m8[ns]')
+        reinterpret(DateTime64{Dates.Minute}, rand(Int64(-(2^63)):Int64(-(2^63)+20), 1000)), # equivalent to np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('M8[m]')
+        Minute.(rand(Int64(-(2^63)):Int64(-(2^63)+20), 1000)), # equivalent to np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('m8[m]')
+    ]
+
+    for codec in codecs
+        for array in arrays
+            encoded = Zarr.zencode(array, codec)
+            decoded = reshape(reinterpret(eltype(array), Zarr.zdecode(encoded, codec)), size(array))
+            @test decoded == array
+        end
+    end
+end
+
+
+@testset "QuantizeFilter" begin
+
+    codecs = [
+        QuantizeFilter{Float64, Float16}(digits=-1),
+        QuantizeFilter{Float64, Float16}(digits=0),
+        QuantizeFilter{Float64, Float16}(digits=1),
+        QuantizeFilter{Float64, Float32}(digits=5),
+        QuantizeFilter{Float64, Float64}(digits=12),
+    ]
+
+    arrays = [
+        LinRange(100, 200, 1000), # np.linspace(100, 200, 1000, dtype='<f8')
+    ]
+
+    for codec in codecs
+        for array in arrays
+            encoded = Zarr.zencode(array, codec)
+            decoded = Zarr.zdecode(encoded, codec)
+            @test decoded ≈ array rtol=1.5*10.0^(-codec.digits)
+        end
+    end
+end
+
+@testset "DeltaFilter" begin
+    data = collect(Int32, 1:1000)
+    codec = DeltaFilter{Int32}()
+    encoded = Zarr.zencode(data, codec)
+    @test Zarr.zdecode(encoded, codec) == data
+end
diff --git a/test/python.jl b/test/python.jl
--- a/test/python.jl
+++ b/test/python.jl
@@ ... @@
 groupattrs = Dict("String attribute"=>"One", "Int attribute"=>5, "Float attribute"=>10.5)
 g = zgroup(pjulia,attrs=groupattrs)
 
 # Test all supported data types and compressors
-import Zarr: NoCompressor, BloscCompressor, ZlibCompressor, MaxLengthString
+import Zarr: NoCompressor, BloscCompressor, ZlibCompressor, MaxLengthString,
+    Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFilter, DeltaFilter
 using Random: randstring
-dtypes = (UInt8, UInt16, UInt32, UInt64,
+numeric_dtypes = (UInt8, UInt16, UInt32, UInt64,
     Int8, Int16, Int32, Int64,
     Float16, Float32, Float64,
     Complex{Float32}, Complex{Float64},
-    Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32},
+    Bool,)
+dtypes = (numeric_dtypes...,
+    MaxLengthString{10,UInt8},MaxLengthString{10,UInt32},
     String)
 compressors = (
     "no"=>NoCompressor(),
@@ -37,9 +40,17 @@ compressors = (
     "blosc_noshuffle"=>BloscCompressor(cname="zstd",shuffle=0),
     "blosc_bitshuffle"=>BloscCompressor(cname="zstd",shuffle=2),
     "zlib"=>ZlibCompressor())
+filters = (
+    "fletcher32"=>Fletcher32Filter(),
+    "scale_offset"=>FixedScaleOffsetFilter(offset=1000, scale=10^6, T=Float64, Tenc=Int32),
+    "shuffle"=>ShuffleFilter(elementsize=4),
+    "quantize"=>QuantizeFilter{Float64,Float32}(digits=5),
+    "delta"=>DeltaFilter{Int32}()
+)
 testarrays = Dict(t=>(t<:AbstractString) ? [randstring(maximum(i.I)) for i in CartesianIndices((1:10,1:6,1:2))] : rand(t,10,6,2) for t in dtypes)
 testzerodimarrays = Dict(t=>(t<:AbstractString) ? randstring(10) : rand(t) for t in dtypes)
 
+# Test arrays with compressors
 for t in dtypes, co in compressors
     compstr, comp = co
     att = Dict("This is a nested attribute"=>Dict("a"=>5))
@@ -49,6 +60,21 @@ for t in dtypes, co in compressors
     a = zcreate(t, g,string("azerodim",t,compstr), compressor=comp)
     a[] = testzerodimarrays[t]
 end
+
+# Test arrays with filters
+for (filterstr, filter) in filters
+    t = eltype(filter) == Any ? Float64 : eltype(filter)
+    att = Dict("Filter test attribute"=>Dict("b"=>6))
+    a = zcreate(t, g,string("filter_",filterstr),10,6,2,attrs=att, chunks = (5,2,2),filters=[filter])
+    testdata = rand(t,10,6,2)
+    a[:,:,:] = testdata
+
+    # Test zero-dimensional array
+    a = zcreate(t, g,string("filter_zerodim_",filterstr), filters=[filter])
+    testzerodim = rand(t)
+    a[] = testzerodim
+end
+
 #Also save as zip file.
 open(pjulia*".zip";write=true) do io
     Zarr.writezip(io, g)
@@ -58,6 +84,7 @@ end
 for julia_path in (pjulia, pjulia*".zip")
 py"""
 import zarr
+import numcodecs
 g = zarr.open_group($julia_path)
 gatts = g.attrs
 """
@@ -67,7 +94,6 @@ gatts = g.attrs
 @test py"gatts['Int attribute']" == 5
 @test py"gatts['Float attribute']" == 10.5
 
-
 dtypesp = ("uint8","uint16","uint32","uint64",
 "int8","int16","int32","int64",
 "float16","float32","float64",
@@ -95,6 +121,30 @@ for i=1:length(dtypes), co in compressors
 end
 end
 
+# Test reading filtered arrays from python
+for (filterstr, filter) in filters
+    t = eltype(filter) == Any ? Float64 : eltype(filter)
+    arname = string("filter_",filterstr)
+    try
+        py"""
+        ar=g[$arname]
+        """
+    catch e
+        @error "Error loading group with filter $filterstr" exception=(e,catch_backtrace())
+        @test false # test failed.
+    end
+
+    @test py"ar.attrs['Filter test attribute']" == Dict("b"=>6)
+    @test py"ar.shape" == (2,6,10)
+
+    # Test zero-dimensional filtered array
+    arname = string("filter_zerodim_",filterstr)
+    py"""
+    ar_zero=g[$arname]
+    """
+    @test py"ar_zero.shape" == ()
+end
+
 for i=1:length(dtypes), co in compressors
     compstr,comp = co
     t = dtypes[i]
@@ -244,6 +294,4 @@ for unit in ["Week", "Day", "Hour", "Minute", "Second",
     @test_py np.datetime64(g_julia[unit][100] |> DateTime |> string) == get(getproperty(g_python,unit),99)
 end
-
-
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index 78067b9..007defc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -267,7 +267,7 @@ end
 
 include("storage.jl")
 
-
+include("Filters.jl")
 include("python.jl")