From f348c46f0d8f322f87e744824d70c2397cd3a8a7 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 20 Aug 2024 09:38:55 -0700 Subject: [PATCH 01/31] Document the Filter interface --- src/Filters.jl | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/Filters.jl b/src/Filters.jl index fde3db9..85363fc 100644 --- a/src/Filters.jl +++ b/src/Filters.jl @@ -1,6 +1,27 @@ import JSON +""" + abstract type Filter{T,TENC} + +The supertype for all Zarr filters. + +## Interface + +All subtypes MUST implement the following methods: + +- [`zencode(ain, filter::Filter)`](@ref zencode): Encodes data `ain` using the filter, and returns a vector of bytes. +- [`zdecode(ain, filter::Filter)`](@ref zdecode): Decodes data `ain`, a vector of bytes, using the filter, and returns the original data. +- [`JSON.lower`](@ref): Returns a JSON-serializable dictionary representing the filter, according to the Zarr specification. +- [`getfilter(::Type{<: Filter}, filterdict)`](@ref getfilter): Returns the filter type read from a given filter dictionary. +If the filter has type parameters, it MUST also implement: +- [`sourcetype(::Filter)::T`](@ref sourcetype): equivalent to `dtype` in the Python Zarr implementation. +- [`desttype(::Filter)::T`](@ref desttype): equivalent to `atype` in the Python Zarr implementation. + + + +Subtypes include: [`VLenArrayFilter`](@ref), [`VLenUTF8Filter`](@ref), [`Fletcher32Filter`](@ref). +""" abstract type Filter{T,TENC} end function getfilters(d::Dict) if !haskey(d,"filters") From 9d765b3104ca504525b86399c2e8345185d3ef89 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 20 Aug 2024 09:44:43 -0700 Subject: [PATCH 02/31] Move filters to a folder Same rationale as the other changes :D - just for cleanliness and clarity. 
--- src/{ => Filters}/Filters.jl | 11 ++++++++++- src/Zarr.jl | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) rename src/{ => Filters}/Filters.jl (95%) diff --git a/src/Filters.jl b/src/Filters/Filters.jl similarity index 95% rename from src/Filters.jl rename to src/Filters/Filters.jl index 85363fc..51325ac 100644 --- a/src/Filters.jl +++ b/src/Filters/Filters.jl @@ -23,6 +23,15 @@ If the filter has type parameters, it MUST also implement: Subtypes include: [`VLenArrayFilter`](@ref), [`VLenUTF8Filter`](@ref), [`Fletcher32Filter`](@ref). """ abstract type Filter{T,TENC} end + +function zencode end +function zdecode end +function getfilter end +function sourcetype end +function desttype end + +filterdict = Dict{String,Type{<:Filter}}() + function getfilters(d::Dict) if !haskey(d,"filters") return nothing @@ -41,6 +50,7 @@ desttype(::Filter{<:Any,T}) where T = T zencode(ain,::Nothing) = ain + """ VLenArrayFilter(T) @@ -109,4 +119,3 @@ JSON.lower(::VLenUTF8Filter) = Dict("id"=>"vlen-utf8") getfilter(::Type{<:VLenArrayFilter}, f) = VLenArrayFilter{typestr(f["dtype"])}() getfilter(::Type{<:VLenUTF8Filter}, f) = VLenUTF8Filter() -filterdict = Dict("vlen-array"=>VLenArrayFilter, "vlen-utf8"=>VLenUTF8Filter) diff --git a/src/Zarr.jl b/src/Zarr.jl index 5f58e61..47a2539 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -6,7 +6,7 @@ import Blosc include("metadata.jl") include("Compressors.jl") include("Storage/Storage.jl") -include("Filters.jl") +include("Filters/Filters.jl") include("ZArray.jl") include("ZGroup.jl") From cb374ce4008e26ba6a8e731a9cf622d9403c31aa Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 20 Aug 2024 09:47:47 -0700 Subject: [PATCH 03/31] Factor out variable-length filters to a new file --- src/Filters/Filters.jl | 69 +------------------------------- src/Filters/vlenfilters.jl | 80 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 68 deletions(-) create mode 100644 src/Filters/vlenfilters.jl diff --git 
a/src/Filters/Filters.jl b/src/Filters/Filters.jl index 51325ac..3eb04d4 100644 --- a/src/Filters/Filters.jl +++ b/src/Filters/Filters.jl @@ -50,72 +50,5 @@ desttype(::Filter{<:Any,T}) where T = T zencode(ain,::Nothing) = ain - -""" - VLenArrayFilter(T) - -Encodes and decodes variable-length arrays of arbitrary data type -""" -struct VLenArrayFilter{T} <: Filter{T,UInt8} end - -""" - VLenUTF8Filter - -Encodes and decodes variable-length unicode strings -""" -struct VLenUTF8Filter <: Filter{String, UInt8} end - -function zdecode(ain, ::VLenArrayFilter{T}) where T - f = IOBuffer(ain) - nitems = read(f, UInt32) - out = Array{Vector{T}}(undef,nitems) - for i=1:nitems - len1 = read(f,UInt32) - out[i] = read!(f,Array{T}(undef,len1 ÷ sizeof(T))) - end - close(f) - out -end - -#Encodes Array of Vectors a into bytes -function zencode(ain,::VLenArrayFilter) - b = IOBuffer() - nitems = length(ain) - write(b,UInt32(nitems)) - for a in ain - write(b, UInt32(length(a) * sizeof(eltype(a)))) - write(b, a) - end - take!(b) -end - -function zdecode(ain, ::VLenUTF8Filter) - f = IOBuffer(ain) - nitems = read(f, UInt32) - out = Array{String}(undef, nitems) - for i in 1:nitems - clen = read(f, UInt32) - out[i] = String(read(f, clen)) - end - close(f) - out -end - -function zencode(ain, ::VLenUTF8Filter) - b = IOBuffer() - nitems = length(ain) - write(b, UInt32(nitems)) - for a in ain - utf8encoded = transcode(String, a) - write(b, UInt32(ncodeunits(utf8encoded))) - write(b, utf8encoded) - end - take!(b) -end - -JSON.lower(::VLenArrayFilter{T}) where T = Dict("id"=>"vlen-array","dtype"=> typestr(T) ) -JSON.lower(::VLenUTF8Filter) = Dict("id"=>"vlen-utf8") - -getfilter(::Type{<:VLenArrayFilter}, f) = VLenArrayFilter{typestr(f["dtype"])}() -getfilter(::Type{<:VLenUTF8Filter}, f) = VLenUTF8Filter() +include("vlenfilters.jl") diff --git a/src/Filters/vlenfilters.jl b/src/Filters/vlenfilters.jl new file mode 100644 index 0000000..e9331f9 --- /dev/null +++ b/src/Filters/vlenfilters.jl @@ -0,0 
+1,80 @@ +#= +# Variable-length filters + +This file implements variable-length filters for Zarr, i.e., filters that write arrays of variable-length arrays ("ragged arrays"). + +Specifically, it implements the `VLenArrayFilter` and `VLenUTF8Filter` types, which are used to encode and decode variable-length arrays and UTF-8 strings, respectively. +=# + +# ## VLenArrayFilter + +""" + VLenArrayFilter(T) + +Encodes and decodes variable-length arrays of arbitrary data type `T`. +""" +struct VLenArrayFilter{T} <: Filter{T,UInt8} end +# We don't need to define `sourcetype` and `desttype` for this filter, since the generic implementations are sufficient. + +JSON.lower(::VLenArrayFilter{T}) where T = Dict("id"=>"vlen-array","dtype"=> typestr(T) ) +getfilter(::Type{<:VLenArrayFilter}, f) = VLenArrayFilter{typestr(f["dtype"])}() + +function zdecode(ain, ::VLenArrayFilter{T}) where T + f = IOBuffer(ain) + nitems = read(f, UInt32) + out = Array{Vector{T}}(undef,nitems) + for i=1:nitems + len1 = read(f,UInt32) + out[i] = read!(f,Array{T}(undef,len1 ÷ sizeof(T))) + end + close(f) + out +end + +#Encodes Array of Vectors `ain` into bytes +function zencode(ain,::VLenArrayFilter) + b = IOBuffer() + nitems = length(ain) + write(b,UInt32(nitems)) + for a in ain + write(b, UInt32(length(a) * sizeof(eltype(a)))) + write(b, a) + end + take!(b) +end + +# ## VLenUTF8Filter + +""" + VLenUTF8Filter + +Encodes and decodes variable-length unicode strings +""" +struct VLenUTF8Filter <: Filter{String, UInt8} end + +JSON.lower(::VLenUTF8Filter) = Dict("id"=>"vlen-utf8") +getfilter(::Type{<:VLenUTF8Filter}, f) = VLenUTF8Filter() + +function zdecode(ain, ::VLenUTF8Filter) + f = IOBuffer(ain) + nitems = read(f, UInt32) + out = Array{String}(undef, nitems) + for i in 1:nitems + clen = read(f, UInt32) + out[i] = String(read(f, clen)) + end + close(f) + out +end + +function zencode(ain, ::VLenUTF8Filter) + b = IOBuffer() + nitems = length(ain) + write(b, UInt32(nitems)) + for a in ain + utf8encoded = 
transcode(String, a) + write(b, UInt32(ncodeunits(utf8encoded))) + write(b, utf8encoded) + end + take!(b) +end From 12931a2551ef44e15c2644dca9d8a5dfbde98d04 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 20 Aug 2024 09:49:50 -0700 Subject: [PATCH 04/31] Add docstrings to filter API functions --- src/Filters/Filters.jl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl index 3eb04d4..c2a7965 100644 --- a/src/Filters/Filters.jl +++ b/src/Filters/Filters.jl @@ -24,10 +24,39 @@ Subtypes include: [`VLenArrayFilter`](@ref), [`VLenUTF8Filter`](@ref), [`Fletche """ abstract type Filter{T,TENC} end +""" + zencode(ain, filter::Filter) + +Encodes data `ain` using the filter, and returns a vector of bytes. +""" function zencode end + +""" + zdecode(ain, filter::Filter) + +Decodes data `ain`, a vector of bytes, using the filter, and returns the original data. +""" function zdecode end + +""" + getfilter(::Type{<: Filter}, filterdict) + +Returns the filter type read from a given specification dictionary, which must follow the Zarr specification. +""" function getfilter end + +""" + sourcetype(::Filter)::T + +Returns the source type of the filter. +""" function sourcetype end + +""" + desttype(::Filter)::T + +Returns the destination type of the filter. 
+""" function desttype end filterdict = Dict{String,Type{<:Filter}}() From 7d7606af8c05143aa3c2d4bb73483177a756ec26 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 20 Aug 2024 10:09:18 -0700 Subject: [PATCH 05/31] Add a Fletcher32 filter and test --- src/Filters/Filters.jl | 5 ++- src/Filters/fletcher32.jl | 85 +++++++++++++++++++++++++++++++++++++++ test/Filters.jl | 31 ++++++++++++++ test/runtests.jl | 2 +- 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 src/Filters/fletcher32.jl create mode 100644 test/Filters.jl diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl index c2a7965..77d3764 100644 --- a/src/Filters/Filters.jl +++ b/src/Filters/Filters.jl @@ -18,6 +18,9 @@ If the filter has type parameters, it MUST also implement: - [`sourcetype(::Filter)::T`](@ref sourcetype): equivalent to `dtype` in the Python Zarr implementation. - [`desttype(::Filter)::T`](@ref desttype): equivalent to `atype` in the Python Zarr implementation. +Finally, an entry MUST be added to the `filterdict` dictionary for each filter type. +This must also follow the Zarr specification's name for that filter. The name of the filter +is the key, and the value is the filter type (e.g. `VLenUInt8Filter` or `Fletcher32Filter`). Subtypes include: [`VLenArrayFilter`](@ref), [`VLenUTF8Filter`](@ref), [`Fletcher32Filter`](@ref). @@ -80,4 +83,4 @@ desttype(::Filter{<:Any,T}) where T = T zencode(ain,::Nothing) = ain include("vlenfilters.jl") - +include("fletcher32.jl") diff --git a/src/Filters/fletcher32.jl b/src/Filters/fletcher32.jl new file mode 100644 index 0000000..4d2bda7 --- /dev/null +++ b/src/Filters/fletcher32.jl @@ -0,0 +1,85 @@ +#= +# Fletcher32 filter + +This "filter" basically injects a 4-byte checksum at the end of the data, to ensure data integrity. 
+ +The implementation is based on the [numcodecs implementation here](https://github.com/zarr-developers/numcodecs/blob/79d1a8d4f9c89d3513836aba0758e0d2a2a1cfaf/numcodecs/fletcher32.pyx) +and the [original C implementation for NetCDF](https://github.com/Unidata/netcdf-c/blob/main/plugins/H5checksum.c#L109) linked therein. + +=# + +""" + Fletcher32Filter() + +A compressor that uses the Fletcher32 checksum algorithm to compress and uncompress data. + +Note that this goes from UInt8 to UInt8, and is effectively only checking +the checksum and cropping the last 4 bytes of the data during decoding. +""" +struct Fletcher32Filter <: Filter{UInt8, UInt8} +end + +getFilter(::Type{<: Fletcher32Filter}, d::Dict) = Fletcher32Filter() +JSON.lower(::Fletcher32Filter) = Dict("id" => "fletcher32") +filterdict["fletcher32"] = Fletcher32Filter + +function _checksum_fletcher32(data::AbstractVector{UInt8}) + len = length(data) / 2 # length in 16-bit words + sum1::UInt32 = 0 + sum2::UInt32 = 0 + data_idx = 1 + + #= + Compute the checksum for pairs of bytes. + The magic `360` value is the largest number of sums that can be performed without overflow in UInt32. + =# + while len > 0 + tlen = len > 360 ? 360 : len + len -= tlen + while tlen > 0 + sum1 += begin # create a 16 bit word from two bytes, the first one shifted to the end of the word + (UInt16(data[data_idx]) << 8) | UInt16(data[data_idx + 1]) + end + sum2 += sum1 + data_idx += 2 + tlen -= 1 + if tlen < 1 + break + end + end + sum1 = (sum1 & 0xffff) + (sum1 >> 16) + sum2 = (sum2 & 0xffff) + (sum2 >> 16) + end + + # if the length of the data is odd, add the first byte to the checksum again (?!) 
+ if length(data) % 2 == 1 + sum1 += UInt16(data[1]) << 8 + sum2 += sum1 + sum1 = (sum1 & 0xffff) + (sum1 >> 16) + sum2 = (sum2 & 0xffff) + (sum2 >> 16) + end + return (sum2 << 16) | sum1 +end + +function zencode(data, ::Fletcher32Filter) + bytes = reinterpret(UInt8, data) + checksum = _checksum_fletcher32(bytes) + result = copy(bytes) + append!(result, reinterpret(UInt8, [checksum])) # TODO: decompose this without the extra allocation of wrapping in Array + return result +end + +function zdecode(data, ::Fletcher32Filter) + bytes = reinterpret(UInt8, data) + checksum = _checksum_fletcher32(view(bytes, 1:length(bytes) - 4)) + stored_checksum = only(reinterpret(UInt32, view(bytes, (length(bytes) - 3):length(bytes)))) + if checksum != stored_checksum + throw(ErrorException(""" + Checksum mismatch in Fletcher32 decoding. + + The computed value is $(checksum) and the stored value is $(stored_checksum). + This might be a sign that the data is corrupted. + """)) # TODO: make this a custom error type + end + return view(bytes, 1:length(bytes) - 4) +end diff --git a/test/Filters.jl b/test/Filters.jl new file mode 100644 index 0000000..426390d --- /dev/null +++ b/test/Filters.jl @@ -0,0 +1,31 @@ +using Test + +using Zarr: zencode, zdecode +using Zarr: Fletcher32Filter + +@testset "Fletcher32Filter" begin + # These tests are copied exactly from the [`numcodecs`](https://github.com/zarr-developers/numcodecs/) Python package, + # specifically [this file](https://github.com/zarr-developers/numcodecs/blob/main/numcodecs/tests/test_fletcher32.py). 
+ + bit_data = vcat( + b"w\x07\x00\x00\x00\x00\x00\x00\x85\xf6\xff\xff\xff\xff\xff\xff", + b"i\x07\x00\x00\x00\x00\x00\x00\x94\xf6\xff\xff\xff\xff\xff\xff", + b"\x88\t\x00\x00\x00\x00\x00\x00i\x03\x00\x00\x00\x00\x00\x00", + b"\x93\xfd\xff\xff\xff\xff\xff\xff\xc3\xfc\xff\xff\xff\xff\xff\xff", + b"'\x02\x00\x00\x00\x00\x00\x00\xba\xf7\xff\xff\xff\xff\xff\xff", + b"\xfd%\x86d", + ) + expected = [1911, -2427, 1897, -2412, 2440, 873, -621, -829, 551, -2118] + @test reinterpret(Int64, zdecode(bit_data, Fletcher32Filter())) == expected + @test zencode(expected, Fletcher32Filter()) == bit_data + + for Typ in (UInt8, Int32, Float32, Float64) + arr = rand(Typ, 100) + @test reinterpret(Typ, zdecode(zencode(arr, Fletcher32Filter()), Fletcher32Filter())) == arr + end + + data = rand(100) + enc = zencode(data, Fletcher32Filter()) + enc[begin] += 1 + @test_throws "Checksum mismatch in Fletcher32 decoding" zdecode(enc, Fletcher32Filter()) +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 78067b9..007defc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -267,7 +267,7 @@ end include("storage.jl") - +include("Filters.jl") include("python.jl") From 6a34368201e7c9ffba99613859c98b51c656d29a Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 20 Aug 2024 10:09:29 -0700 Subject: [PATCH 06/31] re-add the dictionary entries for the vlen filters --- src/Filters/vlenfilters.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Filters/vlenfilters.jl b/src/Filters/vlenfilters.jl index e9331f9..dad91df 100644 --- a/src/Filters/vlenfilters.jl +++ b/src/Filters/vlenfilters.jl @@ -18,6 +18,7 @@ struct VLenArrayFilter{T} <: Filter{T,UInt8} end JSON.lower(::VLenArrayFilter{T}) where T = Dict("id"=>"vlen-array","dtype"=> typestr(T) ) getfilter(::Type{<:VLenArrayFilter}, f) = VLenArrayFilter{typestr(f["dtype"])}() +filterdict["vlen-array"] = VLenArrayFilter function zdecode(ain, ::VLenArrayFilter{T}) where T f = IOBuffer(ain) @@ -54,6 +55,7 @@ 
struct VLenUTF8Filter <: Filter{String, UInt8} end JSON.lower(::VLenUTF8Filter) = Dict("id"=>"vlen-utf8") getfilter(::Type{<:VLenUTF8Filter}, f) = VLenUTF8Filter() +filterdict["vlen-utf8"] = VLenUTF8Filter function zdecode(ain, ::VLenUTF8Filter) f = IOBuffer(ain) From fbf911e91179aeed3c22af812e0a788f6f9d29ee Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 20 Aug 2024 15:15:39 -0700 Subject: [PATCH 07/31] Semi-working fixed scale offset filter --- src/Filters/Filters.jl | 1 + src/Filters/fixedscaleoffset.jl | 74 +++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 src/Filters/fixedscaleoffset.jl diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl index 77d3764..72816d1 100644 --- a/src/Filters/Filters.jl +++ b/src/Filters/Filters.jl @@ -84,3 +84,4 @@ zencode(ain,::Nothing) = ain include("vlenfilters.jl") include("fletcher32.jl") +include("fixedscaleoffset.jl") diff --git a/src/Filters/fixedscaleoffset.jl b/src/Filters/fixedscaleoffset.jl new file mode 100644 index 0000000..2109399 --- /dev/null +++ b/src/Filters/fixedscaleoffset.jl @@ -0,0 +1,74 @@ + +""" + FixedScaleOffsetFilter{T,TENC}(scale, offset) + +A compressor that scales and offsets the data. 
+""" +struct FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc} <: Filter{T, Tenc} + scale::ScaleOffsetType + offset::ScaleOffsetType +end + +FixedScaleOffsetFilter{T}(scale::ScaleOffsetType, offset::ScaleOffsetType) where {T, ScaleOffsetType} = FixedScaleOffsetFilter{T, ScaleOffsetType}(scale, offset) +FixedScaleOffsetFilter(scale::ScaleOffsetType, offset::ScaleOffsetType) where {ScaleOffsetType} = FixedScaleOffsetFilter{ScaleOffsetType, ScaleOffsetType}(scale, offset) + +function FixedScaleOffsetFilter(; scale::ScaleOffsetType, offset::ScaleOffsetType, T, Tenc = T) where ScaleOffsetType + return FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}(scale, offset) +end + +# function zencode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} +# return @. convert(Tenc, # convert to the encoding type after applying the scale and offset +# round((a - c.offset) * c.scale) # apply scale and offset, and round to nearest integer +# ) +# end + +function zdecode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} + return _reinterpret(Base.nonmissingtype(T), @. a / c.scale + c.offset) +end + + +function getFilter(::Type{<: FixedScaleOffsetFilter}, d::Dict) + scale = d["scale"] + offset = d["offset"] + # Types must be converted from strings to the actual Julia types they represent. 
+ string_T = d["dtype"] + string_Tenc = get(d, "atype", string_T) + T = typestr(string_T) + Tenc = typestr(string_Tenc) + return FixedScaleOffsetFilter{T, Tenc}(scale, offset) +end + +function JSON.lower(c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {ScaleOffsetType, T, Tenc} + return Dict("id" => "fixedscaleoffset", "scale" => c.scale, "offset" => c.offset, "dtype" => typestr(T), "atype" => typestr(Tenc)) +end + +filterdict["fixedscaleoffset"] = FixedScaleOffsetFilter + +#= +# Tests + +arrays = [ + LinRange{Float64}(1000, 1001, 1000), + randn(1000) .+ 1000, + reshape(LinRange{Float64}(1000, 1001, 1000), (100, 10)), + reshape(LinRange{Float64}(1000, 1001, 1000), (10, 10, 10)), +] + +codecs = [ + FixedScaleOffsetFilter(offset = 1000, scale = 1, T = Float64, Tenc = Int8), + FixedScaleOffsetFilter(offset = 1000, scale = 10^2, T = Float64, Tenc = Int16), + FixedScaleOffsetFilter(offset = 1000, scale = 10^6, T = Float64, Tenc = Int32), + FixedScaleOffsetFilter(offset = 1000, scale = 10^12, T = Float64, Tenc = Int64), + FixedScaleOffsetFilter(offset = 1000, scale = 10^12, T = Float64), +] + +for array in arrays + for codec in codecs + encoded = zencode(array, codec) + decoded = zdecode(encoded, codec) + tolerance = round(Int, log10(codec.scale)) + @test decoded ≈ array atol=tolerance + end +end + +=# \ No newline at end of file From b960c3704090510ecb0f17ad098903ae0fc7ee11 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Thu, 22 Aug 2024 09:30:50 -0700 Subject: [PATCH 08/31] Add FixedScaleOffset tests --- src/Filters/fixedscaleoffset.jl | 39 +++++---------------------------- test/Filters.jl | 31 +++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/Filters/fixedscaleoffset.jl b/src/Filters/fixedscaleoffset.jl index 2109399..80be305 100644 --- a/src/Filters/fixedscaleoffset.jl +++ b/src/Filters/fixedscaleoffset.jl @@ -16,11 +16,11 @@ function FixedScaleOffsetFilter(; scale::ScaleOffsetType, offset::ScaleOffsetTyp 
return FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}(scale, offset) end -# function zencode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} -# return @. convert(Tenc, # convert to the encoding type after applying the scale and offset -# round((a - c.offset) * c.scale) # apply scale and offset, and round to nearest integer -# ) -# end +function zencode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} + return @. convert(Tenc, # convert to the encoding type after applying the scale and offset + round((a - c.offset) * c.scale) # apply scale and offset, and round to nearest integer + ) +end function zdecode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} return _reinterpret(Base.nonmissingtype(T), @. a / c.scale + c.offset) @@ -43,32 +43,3 @@ function JSON.lower(c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where { end filterdict["fixedscaleoffset"] = FixedScaleOffsetFilter - -#= -# Tests - -arrays = [ - LinRange{Float64}(1000, 1001, 1000), - randn(1000) .+ 1000, - reshape(LinRange{Float64}(1000, 1001, 1000), (100, 10)), - reshape(LinRange{Float64}(1000, 1001, 1000), (10, 10, 10)), -] - -codecs = [ - FixedScaleOffsetFilter(offset = 1000, scale = 1, T = Float64, Tenc = Int8), - FixedScaleOffsetFilter(offset = 1000, scale = 10^2, T = Float64, Tenc = Int16), - FixedScaleOffsetFilter(offset = 1000, scale = 10^6, T = Float64, Tenc = Int32), - FixedScaleOffsetFilter(offset = 1000, scale = 10^12, T = Float64, Tenc = Int64), - FixedScaleOffsetFilter(offset = 1000, scale = 10^12, T = Float64), -] - -for array in arrays - for codec in codecs - encoded = zencode(array, codec) - decoded = zdecode(encoded, codec) - tolerance = round(Int, log10(codec.scale)) - @test decoded ≈ array atol=tolerance - end -end - -=# \ No newline at end of file diff --git a/test/Filters.jl b/test/Filters.jl index 426390d..002a73f 
100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -28,4 +28,33 @@ using Zarr: Fletcher32Filter enc = zencode(data, Fletcher32Filter()) enc[begin] += 1 @test_throws "Checksum mismatch in Fletcher32 decoding" zdecode(enc, Fletcher32Filter()) -end \ No newline at end of file +end + +#= +@testset "FixedScaleOffsetFilter" begin + arrays = [ + LinRange{Float64}(1000, 1001, 1000), + randn(1000) .+ 1000, + reshape(LinRange{Float64}(1000, 1001, 1000), (100, 10)), + reshape(LinRange{Float64}(1000, 1001, 1000), (10, 10, 10)), + ] + + codecs = [ + FixedScaleOffsetFilter(offset = 1000, scale = 1, T = Float64, Tenc = Int8), + FixedScaleOffsetFilter(offset = 1000, scale = 10^2, T = Float64, Tenc = Int16), + FixedScaleOffsetFilter(offset = 1000, scale = 10^6, T = Float64, Tenc = Int32), + FixedScaleOffsetFilter(offset = 1000, scale = 10^12, T = Float64, Tenc = Int64), + FixedScaleOffsetFilter(offset = 1000, scale = 10^12, T = Float64), + ] + + for array in arrays + for codec in codecs + encoded = Zarr.zencode(array, codec) + decoded = Zarr.zdecode(encoded, codec) + decimal = round(log10(codec.scale)) + @test decoded ≈ array rtol=1.5*10^(-decimal) + end + end +end + +=# \ No newline at end of file From dcae156cabb5742359b9eb1c9731384a2d0f56db Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Thu, 22 Aug 2024 09:31:27 -0700 Subject: [PATCH 09/31] Add shuffle filter (buggy in the last few bytes, indexing issues) --- src/Filters/Filters.jl | 1 + src/Filters/shuffle.jl | 70 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 src/Filters/shuffle.jl diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl index 72816d1..d6610a8 100644 --- a/src/Filters/Filters.jl +++ b/src/Filters/Filters.jl @@ -85,3 +85,4 @@ zencode(ain,::Nothing) = ain include("vlenfilters.jl") include("fletcher32.jl") include("fixedscaleoffset.jl") +include("shuffle.jl") diff --git a/src/Filters/shuffle.jl b/src/Filters/shuffle.jl new file mode 100644 index 
0000000..2a4c82f --- /dev/null +++ b/src/Filters/shuffle.jl @@ -0,0 +1,70 @@ +#= +# Shuffle compression + +This file implements the shuffle compressor. +=# + +struct ShuffleFilter <: Filter{UInt8, UInt8} + elementsize::Csize_t +end + +ShuffleFilter(; elementsize = 4) = ShuffleFilter(elementsize) + +function _do_shuffle!(dest::AbstractVector{UInt8}, source::AbstractVector{UInt8}, elementsize::Csize_t) + count = fld(length(source)-1, elementsize) # elementsize is in bytes, so this works + for i in 0:(count-1) + offset = i * elementsize + for byte_index in 0:(elementsize-1) + j = byte_index * count + i + dest[j+1] = source[offset + byte_index+1] + end + end +end + +function _do_unshuffle!(dest::AbstractVector{UInt8}, source::AbstractVector{UInt8}, elementsize::Csize_t) + count = fld(length(source)-1, elementsize) # elementsize is in bytes, so this works + for i in 0:(elementsize-1) + offset = i * count + for byte_index in 0:(count-1) + j = byte_index * elementsize + i + dest[j+1] = source[offset + byte_index+1] + end + end +end + +function zencode(a::AbstractArray, c::ShuffleFilter) + if c.elementsize <= 1 # no shuffling needed if elementsize is 1 + return a + end + source = reinterpret(UInt8, vec(a)) + dest = Vector{UInt8}(undef, length(source)) + _do_shuffle!(dest, source, c.elementsize) + return dest +end + +function zdecode(a::AbstractArray, c::ShuffleFilter) + if c.elementsize <= 1 # no shuffling needed if elementsize is 1 + return a + end + source = reinterpret(UInt8, vec(a)) + dest = Vector{UInt8}(undef, length(source)) + _do_unshuffle!(dest, source, c.elementsize) + return dest +end + +function getFilter(::Type{ShuffleFilter}, d::Dict) + return ShuffleFilter(d["elementsize"]) +end + +function JSON.lower(c::ShuffleFilter) + return Dict("id" => "shuffle", "elementsize" => Int64(c.elementsize)) +end + +filterdict["shuffle"] = ShuffleFilter +#= + +# Tests + + + +=# \ No newline at end of file From 7a5a5a06f81797af7b0a615e1920bff08ef64c0c Mon Sep 17 00:00:00 2001 
From: Anshul Singhvi Date: Thu, 22 Aug 2024 09:31:50 -0700 Subject: [PATCH 10/31] WIP quantize filter --- src/Filters/quantize.jl | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 src/Filters/quantize.jl diff --git a/src/Filters/quantize.jl b/src/Filters/quantize.jl new file mode 100644 index 0000000..ac4ef61 --- /dev/null +++ b/src/Filters/quantize.jl @@ -0,0 +1,42 @@ +#= +# Quantize compression + + +=# + +""" + QuantizeFilter(; digits, DecodingType, [EncodingType = DecodingType]) + +Quantization based compression for Zarr arrays. +""" +struct QuantizeFilter{T, TENC} <: Filter{T, TENC} + digits::Int32 +end + +function QuantizeFilter(; digits = 10, T = Float16, Tenc = DecodingType) + return QuantizeFilter{T, Tenc}(digits) +end + +function zencode(data::AbstractArray, filter::QuantizeFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} + arr = reinterpret(DecodingType, vec(data)) + + precision = 10^(-filter.digits) + + _exponent = log(precision, 10) + exponent = _exponent < 0 ? floor(Int, _exponent) : ceil(Int, _exponent) + + bits = ceil(log(10^(-exponent), 2)) + scale = 2^bits + + enc = @. 
round(scale * arr) / scale + + if EncodingType == DecodingType + return enc + else + return reinterpret(EncodingType, enc) + end +end + +function zdecode(data::AbstractArray, filter::QuantizeFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} + return data +end \ No newline at end of file From 231c0a1aaf92305371e0de65500fb70c938c4272 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 23 Aug 2024 14:39:38 -0700 Subject: [PATCH 11/31] ShuffleFilter working and tested --- src/Filters/shuffle.jl | 4 ++-- test/Filters.jl | 37 +++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/Filters/shuffle.jl b/src/Filters/shuffle.jl index 2a4c82f..0dc49a7 100644 --- a/src/Filters/shuffle.jl +++ b/src/Filters/shuffle.jl @@ -11,7 +11,7 @@ end ShuffleFilter(; elementsize = 4) = ShuffleFilter(elementsize) function _do_shuffle!(dest::AbstractVector{UInt8}, source::AbstractVector{UInt8}, elementsize::Csize_t) - count = fld(length(source)-1, elementsize) # elementsize is in bytes, so this works + count = fld(length(source), elementsize) # elementsize is in bytes, so this works for i in 0:(count-1) offset = i * elementsize for byte_index in 0:(elementsize-1) @@ -22,7 +22,7 @@ function _do_shuffle!(dest::AbstractVector{UInt8}, source::AbstractVector{UInt8} end function _do_unshuffle!(dest::AbstractVector{UInt8}, source::AbstractVector{UInt8}, elementsize::Csize_t) - count = fld(length(source)-1, elementsize) # elementsize is in bytes, so this works + count = fld(length(source), elementsize) # elementsize is in bytes, so this works for i in 0:(elementsize-1) offset = i * count for byte_index in 0:(count-1) diff --git a/test/Filters.jl b/test/Filters.jl index 002a73f..89656cf 100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -1,7 +1,8 @@ using Test +using Zarr: DateTime64 # for datetime reinterpret using Zarr: zencode, zdecode -using Zarr: Fletcher32Filter +using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, 
ShuffleFilter @testset "Fletcher32Filter" begin # These tests are copied exactly from the [`numcodecs`](https://github.com/zarr-developers/numcodecs/) Python package, @@ -56,5 +57,37 @@ end end end end +=# +@testset "ShuffleFilter" begin -=# \ No newline at end of file + codecs = [ + ShuffleFilter(), + ShuffleFilter(elementsize=0), + ShuffleFilter(elementsize=4), + ShuffleFilter(elementsize=8), + ] + + arrays = [ + Int32.(collect(1:1000)), # equivalent to np.arange(1000, dtype='i4') + LinRange(1000, 1001, 1000), # equivalent to np.linspace(1000, 1001, 1000, dtype='f8') + reshape(randn(1000) .* 1 .+ 1000, (100, 10)), # equivalent to np.random.normal(loc=1000, scale=1, size=(100, 10)) + reshape(rand(Bool, 1000), (10, 100)), # equivalent to np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10, order='F') + reshape(rand(Zarr.MaxLengthString{3, UInt8}["a", "bb", "ccc"], 1000), (10, 10, 10)), # equivalent to np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10) + reinterpret(DateTime64{Dates.Nanosecond}, rand(UInt64(0):UInt64(2^60)-1, 1000)), # equivalent to np.random.randint(0, 2**60, size=1000, dtype='u8').view('M8[ns]') + Nanosecond.(rand(UInt64(0):UInt64(2^60-1), 1000)), # equivalent to np.random.randint(0, 2**60, size=1000, dtype='u8').view('m8[ns]') + reinterpret(DateTime64{Dates.Minute}, rand(UInt64(0):UInt64(2^25-1), 1000)), # equivalent to np.random.randint(0, 2**25, size=1000, dtype='u8').view('M8[m]') + Minute.(rand(UInt64(0):UInt64(2^25-1), 1000)), # equivalent to np.random.randint(0, 2**25, size=1000, dtype='u8').view('m8[m]') + reinterpret(DateTime64{Dates.Nanosecond}, rand(Int64(-(2^63)):Int64(-(2^63)+20), 1000)), # equivalent to np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('M8[ns]') + Nanosecond.(rand(Int64(-(2^63)):Int64(-(2^63)+20), 1000)), # equivalent to np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('m8[ns]') + reinterpret(DateTime64{Dates.Minute}, 
rand(Int64(-(2^63)):Int64(-(2^63)+20), 1000)), # equivalent to np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('M8[m]') + Minute.(rand(Int64(-(2^63)):Int64(-(2^63)+20), 1000)), # equivalent to np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('m8[m]') + ] + + for codec in codecs + for array in arrays + encoded = Zarr.zencode(array, codec) + decoded = reshape(reinterpret(eltype(array), Zarr.zdecode(encoded, codec)), size(array)) + @test decoded == array + end + end +end From ecdbeea5f9ebdd94f80048fc2eff7730dcaa8fa8 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 23 Aug 2024 15:03:53 -0700 Subject: [PATCH 12/31] Semi working quantize filter --- src/Filters/Filters.jl | 1 + src/Filters/quantize.jl | 13 ++++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl index d6610a8..3161bac 100644 --- a/src/Filters/Filters.jl +++ b/src/Filters/Filters.jl @@ -86,3 +86,4 @@ include("vlenfilters.jl") include("fletcher32.jl") include("fixedscaleoffset.jl") include("shuffle.jl") +include("quantize.jl") \ No newline at end of file diff --git a/src/Filters/quantize.jl b/src/Filters/quantize.jl index ac4ef61..73693a8 100644 --- a/src/Filters/quantize.jl +++ b/src/Filters/quantize.jl @@ -13,20 +13,23 @@ struct QuantizeFilter{T, TENC} <: Filter{T, TENC} digits::Int32 end -function QuantizeFilter(; digits = 10, T = Float16, Tenc = DecodingType) +function QuantizeFilter(; digits = 10, T = Float16, Tenc = T) return QuantizeFilter{T, Tenc}(digits) end +QuantizeFilter{T, Tenc}(; digits = 10) where {T, Tenc} = QuantizeFilter{T, Tenc}(digits) +QuantizeFilter{T}(; digits = 10) where T = QuantizeFilter{T, T}(digits) + function zencode(data::AbstractArray, filter::QuantizeFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} arr = reinterpret(DecodingType, vec(data)) - precision = 10^(-filter.digits) + precision = 10.0^(-filter.digits) - _exponent = 
log(precision, 10) + _exponent = log(10, precision) # log of `precision` in base 10 + exponent = _exponent < 0 ? floor(Int, _exponent) : ceil(Int, _exponent) - bits = ceil(log(10^(-exponent), 2)) - scale = 2^bits + bits = ceil(log(2, 10.0^(-exponent))) + scale = 2.0^bits enc = @. round(scale * arr) / scale From 5b8210fdc83f9a1b4266bced60ecb2752bdffd0e Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 23 Aug 2024 15:04:07 -0700 Subject: [PATCH 13/31] Format tests better --- test/Filters.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/Filters.jl b/test/Filters.jl index 89656cf..a9b0bbc 100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -2,7 +2,7 @@ using Test using Zarr: DateTime64 # for datetime reinterpret using Zarr: zencode, zdecode -using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter +using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFilter @testset "Fletcher32Filter" begin # These tests are copied exactly from the [`numcodecs`](https://github.com/zarr-developers/numcodecs/) Python package, @@ -68,11 +68,11 @@ end ] arrays = [ - Int32.(collect(1:1000)), # equivalent to np.arange(1000, dtype='i4') + Int32.(collect(1:1000)), # equivalent to np.arange(1000, dtype='i4') LinRange(1000, 1001, 1000), # equivalent to np.linspace(1000, 1001, 1000, dtype='f8') reshape(randn(1000) .* 1 .+ 1000, (100, 10)), # equivalent to np.random.normal(loc=1000, scale=1, size=(100, 10)) reshape(rand(Bool, 1000), (10, 100)), # equivalent to np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10, order='F') - reshape(rand(Zarr.MaxLengthString{3, UInt8}["a", "bb", "ccc"], 1000), (10, 10, 10)), # equivalent to np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10) + reshape(rand(Zarr.MaxLengthString{3, UInt8}["a", "bb", "ccc"], 1000), (10, 10, 10)), # equivalent to np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10) reinterpret(DateTime64{Dates.Nanosecond}, 
rand(UInt64(0):UInt64(2^60)-1, 1000)), # equivalent to np.random.randint(0, 2**60, size=1000, dtype='u8').view('M8[ns]') Nanosecond.(rand(UInt64(0):UInt64(2^60-1), 1000)), # equivalent to np.random.randint(0, 2**60, size=1000, dtype='u8').view('m8[ns]') reinterpret(DateTime64{Dates.Minute}, rand(UInt64(0):UInt64(2^25-1), 1000)), # equivalent to np.random.randint(0, 2**25, size=1000, dtype='u8').view('M8[m]') From 16306bedefac232a90ef3e2c4f3febc4e23666b2 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 23 Aug 2024 15:14:10 -0700 Subject: [PATCH 14/31] Complete interface and test quantize --- src/Filters/quantize.jl | 13 +++++++++++- test/Filters.jl | 44 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/Filters/quantize.jl b/src/Filters/quantize.jl index 73693a8..d74e3f5 100644 --- a/src/Filters/quantize.jl +++ b/src/Filters/quantize.jl @@ -40,6 +40,17 @@ function zencode(data::AbstractArray, filter::QuantizeFilter{DecodingType, Encod end end +# Decoding is a no-op; quantization is a lossy filter but data is encoded directly. 
function zdecode(data::AbstractArray, filter::QuantizeFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} return data -end \ No newline at end of file +end + +function JSON.lower(filter::QuantizeFilter{T, Tenc}) where {T, Tenc} + return Dict("type" => "quantize", "digits" => filter.digits, "dtype" => typestring(T), "atype" => typestring(Tenc)) +end + +function getFilter(::Type{<: QuantizeFilter}, d) + return QuantizeFilter{typestr(d["dtype"], typestr(d["atype"]))}(; digits = d["digits"]) +end + +filterdict["quantize"] = QuantizeFilter \ No newline at end of file diff --git a/test/Filters.jl b/test/Filters.jl index a9b0bbc..a7cd31c 100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -91,3 +91,47 @@ end end end end + + +@testset "QuantizeFilter" begin + + codecs = [ + QuantizeFilter{Float64, Float16}(digits=-1), + QuantizeFilter{Float64, Float16}(digits=0), + QuantizeFilter{Float64, Float16}(digits=1), + QuantizeFilter{Float64, Float32}(digits=5), + QuantizeFilter{Float64, Float64}(digits=12), + ] + + arrays = [ + LinRange(100, 200, 1000), # np.linspace(100, 200, 1000, dtype=' Date: Fri, 23 Aug 2024 21:35:38 -0700 Subject: [PATCH 15/31] Uncomment the FixedScaleOffset tests --- test/Filters.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/Filters.jl b/test/Filters.jl index a7cd31c..24ee7d4 100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -31,7 +31,6 @@ using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFil @test_throws "Checksum mismatch in Fletcher32 decoding" zdecode(enc, Fletcher32Filter()) end -#= @testset "FixedScaleOffsetFilter" begin arrays = [ LinRange{Float64}(1000, 1001, 1000), @@ -57,7 +56,7 @@ end end end end -=# + @testset "ShuffleFilter" begin codecs = [ From 42995b2eed37f20086b5c21485843b32b1f135a0 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Wed, 28 Aug 2024 16:51:06 -0700 Subject: [PATCH 16/31] fix getfilter syntax --- src/Filters/fixedscaleoffset.jl | 2 +- 
src/Filters/fletcher32.jl | 2 +- src/Filters/quantize.jl | 2 +- src/Filters/shuffle.jl | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Filters/fixedscaleoffset.jl b/src/Filters/fixedscaleoffset.jl index 80be305..66582a9 100644 --- a/src/Filters/fixedscaleoffset.jl +++ b/src/Filters/fixedscaleoffset.jl @@ -27,7 +27,7 @@ function zdecode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, end -function getFilter(::Type{<: FixedScaleOffsetFilter}, d::Dict) +function getfilter(::Type{<: FixedScaleOffsetFilter}, d::Dict) scale = d["scale"] offset = d["offset"] # Types must be converted from strings to the actual Julia types they represent. diff --git a/src/Filters/fletcher32.jl b/src/Filters/fletcher32.jl index 4d2bda7..3241894 100644 --- a/src/Filters/fletcher32.jl +++ b/src/Filters/fletcher32.jl @@ -19,7 +19,7 @@ the checksum and cropping the last 4 bytes of the data during decoding. struct Fletcher32Filter <: Filter{UInt8, UInt8} end -getFilter(::Type{<: Fletcher32Filter}, d::Dict) = Fletcher32Filter() +getfilter(::Type{<: Fletcher32Filter}, d::Dict) = Fletcher32Filter() JSON.lower(::Fletcher32Filter) = Dict("id" => "fletcher32") filterdict["fletcher32"] = Fletcher32Filter diff --git a/src/Filters/quantize.jl b/src/Filters/quantize.jl index d74e3f5..ea00dcb 100644 --- a/src/Filters/quantize.jl +++ b/src/Filters/quantize.jl @@ -49,7 +49,7 @@ function JSON.lower(filter::QuantizeFilter{T, Tenc}) where {T, Tenc} return Dict("type" => "quantize", "digits" => filter.digits, "dtype" => typestring(T), "atype" => typestring(Tenc)) end -function getFilter(::Type{<: QuantizeFilter}, d) +function getfilter(::Type{<: QuantizeFilter}, d) return QuantizeFilter{typestr(d["dtype"], typestr(d["atype"]))}(; digits = d["digits"]) end diff --git a/src/Filters/shuffle.jl b/src/Filters/shuffle.jl index 0dc49a7..6a01f5d 100644 --- a/src/Filters/shuffle.jl +++ b/src/Filters/shuffle.jl @@ -52,7 +52,7 @@ function zdecode(a::AbstractArray, c::ShuffleFilter) 
return dest end -function getFilter(::Type{ShuffleFilter}, d::Dict) +function getfilter(::Type{ShuffleFilter}, d::Dict) return ShuffleFilter(d["elementsize"]) end From 594ffdcaf59075f4cd81144505699f10b2b23528 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Thu, 29 Aug 2024 15:58:05 -0700 Subject: [PATCH 17/31] Add delta filter --- src/Filters/Filters.jl | 3 ++- src/Filters/delta.jl | 45 ++++++++++++++++++++++++++++++++++++++++++ test/Filters.jl | 29 ++++++++++++++++++++++++++- 3 files changed, 75 insertions(+), 2 deletions(-) create mode 100644 src/Filters/delta.jl diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl index 3161bac..7f7a394 100644 --- a/src/Filters/Filters.jl +++ b/src/Filters/Filters.jl @@ -86,4 +86,5 @@ include("vlenfilters.jl") include("fletcher32.jl") include("fixedscaleoffset.jl") include("shuffle.jl") -include("quantize.jl") \ No newline at end of file +include("quantize.jl") +include("delta.jl") diff --git a/src/Filters/delta.jl b/src/Filters/delta.jl new file mode 100644 index 0000000..ccc7cf5 --- /dev/null +++ b/src/Filters/delta.jl @@ -0,0 +1,45 @@ +#= +# Delta compression + + +=# + +""" + DeltaFilter(; DecodingType, [EncodingType = DecodingType]) + +Delta-based compression for Zarr arrays. (Delta encoding is Julia `diff`, decoding is Julia `cumsum`). 
+""" +struct DeltaFilter{T, TENC} <: Filter{T, TENC} +end + +function DeltaFilter(; DecodingType = Float16, EncodingType = DecodingType) + return DeltaFilter{DecodingType, EncodingType}() +end + +DeltaFilter{T}() where T = DeltaFilter{T, T}() + +function zencode(data::AbstractArray, filter::DeltaFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} + arr = reinterpret(DecodingType, vec(data)) + + enc = similar(arr, EncodingType) + # perform the delta operation + enc[begin] = arr[begin] + enc[begin+1:end] .= diff(arr) + return enc +end + +function zdecode(data::AbstractArray, filter::DeltaFilter{DecodingType, EncodingType}) where {DecodingType, EncodingType} + encoded = reinterpret(EncodingType, vec(data)) + decoded = DecodingType.(cumsum(encoded)) + return decoded +end + +function JSON.lower(filter::DeltaFilter{T, Tenc}) where {T, Tenc} + return Dict("type" => "delta", "dtype" => typestring(T), "atype" => typestring(Tenc)) +end + +function getfilter(::Type{<: DeltaFilter}, d) + return DeltaFilter{typestr(d["dtype"], haskey(d, "atype") ? 
typestr(d["atype"]) : d["dtype"])}() +end + +filterdict["delta"] = DeltaFilter \ No newline at end of file diff --git a/test/Filters.jl b/test/Filters.jl index 24ee7d4..1bec170 100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -2,7 +2,7 @@ using Test using Zarr: DateTime64 # for datetime reinterpret using Zarr: zencode, zdecode -using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFilter +using Zarr: Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFilter, DeltaFilter @testset "Fletcher32Filter" begin # These tests are copied exactly from the [`numcodecs`](https://github.com/zarr-developers/numcodecs/) Python package, @@ -133,4 +133,31 @@ end end end end +end + +@testset "DeltaFilter" begin + + arrays = [ + Int32.(collect(0:999)), # np.arange(1000, dtype=' Date: Wed, 4 Sep 2024 11:44:10 -0700 Subject: [PATCH 18/31] Adapt for Kerchunk playing fast and loose with the spec - Kerchunk often encodes the compressor as the last filter, so we check that the compressor isn't hiding in the filters array if the compressor is null. - Similarly, the dtype is often unknown in this case, or the transform is not encoded correctly, so we ensure that the datatypes of `data` and `a2` remain the same by reinterpreting. 
--- src/Compressors.jl | 4 ++-- src/metadata.jl | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/Compressors.jl b/src/Compressors.jl index b54e97a..ebf0a81 100644 --- a/src/Compressors.jl +++ b/src/Compressors.jl @@ -30,11 +30,11 @@ function zcompress!(compressed, data, c, f) end function zuncompress!(data, compressed, c, f) - data2 = zuncompress(compressed, c, desttype(last(f))) + data2 = zuncompress(compressed, c, desttype(last(f))) a2 = foldr(f, init = data2) do fnow, anow zdecode(anow, fnow) end - copyto!(data, a2) + copyto!(data, _reinterpret(Base.nonmissingtype(eltype(data)), a2)) end diff --git a/src/metadata.jl b/src/metadata.jl index f3dc5df..51bb382 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -156,6 +156,12 @@ function Metadata(d::AbstractDict, fill_as_missing) # create a Metadata struct from it compdict = d["compressor"] + if isnothing(compdict) + # try the last filter, for Kerchunk compat + if !isnothing(d["filters"]) && haskey(compressortypes, d["filters"][end]["id"]) + compdict = pop!(d["filters"]) # TODO: this will not work with JSON3! 
+ end + end compressor = getCompressor(compdict) filters = getfilters(d) From 7518c43caa3525b710ded083efc8bc18f6d9c62d Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Wed, 9 Oct 2024 15:47:08 -0700 Subject: [PATCH 19/31] Fix the delta and quantize JSON.lower --- src/Filters/delta.jl | 2 +- src/Filters/quantize.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Filters/delta.jl b/src/Filters/delta.jl index ccc7cf5..f7cdc3d 100644 --- a/src/Filters/delta.jl +++ b/src/Filters/delta.jl @@ -35,7 +35,7 @@ function zdecode(data::AbstractArray, filter::DeltaFilter{DecodingType, Encoding end function JSON.lower(filter::DeltaFilter{T, Tenc}) where {T, Tenc} - return Dict("type" => "delta", "dtype" => typestring(T), "atype" => typestring(Tenc)) + return Dict("id" => "delta", "dtype" => typestr(T), "atype" => typestr(Tenc)) end function getfilter(::Type{<: DeltaFilter}, d) diff --git a/src/Filters/quantize.jl b/src/Filters/quantize.jl index ea00dcb..12ad9ee 100644 --- a/src/Filters/quantize.jl +++ b/src/Filters/quantize.jl @@ -46,7 +46,7 @@ function zdecode(data::AbstractArray, filter::QuantizeFilter{DecodingType, Encod end function JSON.lower(filter::QuantizeFilter{T, Tenc}) where {T, Tenc} - return Dict("type" => "quantize", "digits" => filter.digits, "dtype" => typestring(T), "atype" => typestring(Tenc)) + return Dict("type" => "quantize", "digits" => filter.digits, "dtype" => typestr(T), "atype" => typestr(Tenc)) end function getfilter(::Type{<: QuantizeFilter}, d) From a3c7710ce9c126bdcbfe56b2e1bebf237d3d6fdf Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Wed, 9 Oct 2024 15:47:33 -0700 Subject: [PATCH 20/31] Change the tests to be more sensible/Julian and avoid truncation errors --- test/Filters.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Filters.jl b/test/Filters.jl index 1bec170..6a0a271 100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -141,12 +141,12 @@ end Int32.(collect(0:999)), # 
np.arange(1000, dtype=' Date: Wed, 9 Oct 2024 15:47:57 -0700 Subject: [PATCH 21/31] Fix the FixedScaleOffset filter materializer --- src/Filters/fixedscaleoffset.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Filters/fixedscaleoffset.jl b/src/Filters/fixedscaleoffset.jl index 66582a9..b48dbc4 100644 --- a/src/Filters/fixedscaleoffset.jl +++ b/src/Filters/fixedscaleoffset.jl @@ -3,6 +3,11 @@ FixedScaleOffsetFilter{T,TENC}(scale, offset) A compressor that scales and offsets the data. + +!!! note + The geographic CF standards define scale/offset decoding as `x * scale + offset`, + but this filter defines it as `x / scale + offset`. Constructing a `FixedScaleOffsetFilter` + from CF data means `FixedScaleOffsetFilter(1/cf_scale_factor, cf_add_offset)`. """ struct FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc} <: Filter{T, Tenc} scale::ScaleOffsetType @@ -35,7 +40,7 @@ function getfilter(::Type{<: FixedScaleOffsetFilter}, d::Dict) string_Tenc = get(d, "atype", string_T) T = typestr(string_T) Tenc = typestr(string_Tenc) - return FixedScaleOffsetFilter{T, Tenc}(scale, offset) + return FixedScaleOffsetFilter{Tenc, T, Tenc}(scale, offset) end function JSON.lower(c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {ScaleOffsetType, T, Tenc} From c211b6f65428098f136bf350fe679432cb8749aa Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Wed, 9 Oct 2024 15:48:15 -0700 Subject: [PATCH 22/31] Fix decoding for fill values to use `reinterpret` on unsigned -> integer --- src/metadata.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/metadata.jl b/src/metadata.jl index 51bb382..b677cc0 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -222,5 +222,6 @@ Base.eltype(::Metadata{T}) where T = T fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v) fill_value_decoding(v::Nothing, ::Any) = v fill_value_decoding(v, T) = T(v) +fill_value_decoding(v::Integer, T::Type{<: Unsigned}) = reinterpret(T, signed(T)(v)) 
fill_value_decoding(v::Number, T::Type{String}) = v == 0 ? "" : T(UInt8[v]) fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v From 086b3b8699aaca90008c4bd3d6cf29ad3f87a7a1 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Wed, 9 Oct 2024 15:48:38 -0700 Subject: [PATCH 23/31] If `getfilter` fails, show the filter name and then throw an error --- src/Filters/Filters.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl index 7f7a394..829f31f 100644 --- a/src/Filters/Filters.jl +++ b/src/Filters/Filters.jl @@ -72,7 +72,12 @@ function getfilters(d::Dict) return nothing end f = map(d["filters"]) do f + try getfilter(filterdict[f["id"]], f) + catch e + @show f + rethrow(e) + end end return (f...,) end From ffdc62929d23af3cdc601faebf00dd229bbc7748 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Mon, 21 Oct 2024 14:06:23 -0700 Subject: [PATCH 24/31] Apply reinterpret before multiplication in fixed-scale-offset filter --- src/Filters/fixedscaleoffset.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Filters/fixedscaleoffset.jl b/src/Filters/fixedscaleoffset.jl index b48dbc4..906ab19 100644 --- a/src/Filters/fixedscaleoffset.jl +++ b/src/Filters/fixedscaleoffset.jl @@ -28,7 +28,8 @@ function zencode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, end function zdecode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} - return _reinterpret(Base.nonmissingtype(T), @. a / c.scale + c.offset) + data = _reinterpret(Base.nonmissingtype(T), a) + return @. 
(data / c.scale) + c.offset end From 24a68e6e369f494898492ac6d379f5d3169503f4 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Mon, 21 Oct 2024 14:07:00 -0700 Subject: [PATCH 25/31] Only reinterpret negative integers when decoding fill values to unsigned --- src/metadata.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/metadata.jl b/src/metadata.jl index b677cc0..456993b 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -222,6 +222,9 @@ Base.eltype(::Metadata{T}) where T = T fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v) fill_value_decoding(v::Nothing, ::Any) = v fill_value_decoding(v, T) = T(v) -fill_value_decoding(v::Integer, T::Type{<: Unsigned}) = reinterpret(T, signed(T)(v)) +# Sometimes, unsigned values are represented as signed integers in strings. +# If the value is negative, then we know it needs reinterpretation, +# but if the value is positive, there is no difference between a signed and unsigned integer. +fill_value_decoding(v::Integer, T::Type{<: Unsigned}) = sign(v) < 0 ? reinterpret(T, signed(T)(v)) : T(v) fill_value_decoding(v::Number, T::Type{String}) = v == 0 ? "" : T(UInt8[v]) fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v From 85c1189fb0d81e268c62bcbb2e0f832ce0a9f0c0 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Mon, 21 Oct 2024 14:10:11 -0700 Subject: [PATCH 26/31] Revert "Only reinterpret negative integers when decoding fill values to unsigned" This reverts commit 24a68e6e369f494898492ac6d379f5d3169503f4. 
--- src/metadata.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/metadata.jl b/src/metadata.jl index 456993b..b677cc0 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -222,9 +222,6 @@ Base.eltype(::Metadata{T}) where T = T fill_value_decoding(v::AbstractString, T::Type{<:Number}) = parse(T, v) fill_value_decoding(v::Nothing, ::Any) = v fill_value_decoding(v, T) = T(v) -# Sometimes, unsigned values are represented as signed integers in strings. -# If the value is negative, then we know it needs reinterpretation, -# but if the value is positive, there is no difference between a signed and unsigned integer. -fill_value_decoding(v::Integer, T::Type{<: Unsigned}) = sign(v) < 0 ? reinterpret(T, signed(T)(v)) : T(v) +fill_value_decoding(v::Integer, T::Type{<: Unsigned}) = reinterpret(T, signed(T)(v)) fill_value_decoding(v::Number, T::Type{String}) = v == 0 ? "" : T(UInt8[v]) fill_value_decoding(v, ::Type{ASCIIChar}) = v == "" ? nothing : v From 3fca4eb37e010e7d0df1cde11d35332527d6cb7e Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 22 Nov 2024 10:39:40 -0500 Subject: [PATCH 27/31] let Fletcher32 operate on n-dimensional arrays not just vectors, as it was previously constrained to --- src/Filters/fletcher32.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Filters/fletcher32.jl b/src/Filters/fletcher32.jl index 3241894..d854cb9 100644 --- a/src/Filters/fletcher32.jl +++ b/src/Filters/fletcher32.jl @@ -23,8 +23,8 @@ getfilter(::Type{<: Fletcher32Filter}, d::Dict) = Fletcher32Filter() JSON.lower(::Fletcher32Filter) = Dict("id" => "fletcher32") filterdict["fletcher32"] = Fletcher32Filter -function _checksum_fletcher32(data::AbstractVector{UInt8}) - len = length(data) / 2 # length in 16-bit words +function _checksum_fletcher32(data::AbstractArray{UInt8}) + len = length(data) ÷ 2 # length in 16-bit words sum1::UInt32 = 0 sum2::UInt32 = 0 data_idx = 1 @@ -62,7 +62,7 @@ function 
_checksum_fletcher32(data::AbstractVector{UInt8}) end function zencode(data, ::Fletcher32Filter) - bytes = reinterpret(UInt8, data) + bytes = reinterpret(UInt8, vec(data)) checksum = _checksum_fletcher32(bytes) result = copy(bytes) append!(result, reinterpret(UInt8, [checksum])) # TODO: decompose this without the extra allocation of wrapping in Array From fdb5defbb6f9f121f1086cf1126b983d0915ce34 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 22 Nov 2024 10:40:39 -0500 Subject: [PATCH 28/31] fix FixedScaleOffset in many ways - Never use reinterpret - use array comprehensions to support 0-dimensional arrays correctly, the performance impact is negligible based on testing - only round if the target type is an integer, otherwise let it be if it's a float. --- src/Filters/fixedscaleoffset.jl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Filters/fixedscaleoffset.jl b/src/Filters/fixedscaleoffset.jl index 906ab19..1924ca1 100644 --- a/src/Filters/fixedscaleoffset.jl +++ b/src/Filters/fixedscaleoffset.jl @@ -22,14 +22,15 @@ function FixedScaleOffsetFilter(; scale::ScaleOffsetType, offset::ScaleOffsetTyp end function zencode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} - return @. convert(Tenc, # convert to the encoding type after applying the scale and offset - round((a - c.offset) * c.scale) # apply scale and offset, and round to nearest integer - ) + if Tenc <: Integer + return [round(Tenc, (a - c.offset) * c.scale) for a in a] # apply scale and offset, and round to nearest integer + else + return [convert(Tenc, (a - c.offset) * c.scale) for a in a] # apply scale and offset + end end function zdecode(a::AbstractArray, c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {T, Tenc, ScaleOffsetType} - data = _reinterpret(Base.nonmissingtype(T), a) - return @. 
(data / c.scale) + c.offset + return [convert(Base.nonmissingtype(T), (a / c.scale) + c.offset) for a in a] end From cf602425ea8697df605bb7aebe954bb83b8b279b Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 22 Nov 2024 11:01:48 -0500 Subject: [PATCH 29/31] add filter tests in Python --- test/python.jl | 60 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/test/python.jl b/test/python.jl index 86a72ca..4a650e3 100644 --- a/test/python.jl +++ b/test/python.jl @@ -22,13 +22,16 @@ groupattrs = Dict("String attribute"=>"One", "Int attribute"=>5, "Float attribut g = zgroup(pjulia,attrs=groupattrs) # Test all supported data types and compressors -import Zarr: NoCompressor, BloscCompressor, ZlibCompressor, MaxLengthString +import Zarr: NoCompressor, BloscCompressor, ZlibCompressor, MaxLengthString, + Fletcher32Filter, FixedScaleOffsetFilter, ShuffleFilter, QuantizeFilter, DeltaFilter using Random: randstring -dtypes = (UInt8, UInt16, UInt32, UInt64, +numeric_dtypes = (UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float16, Float32, Float64, Complex{Float32}, Complex{Float64}, - Bool,MaxLengthString{10,UInt8},MaxLengthString{10,UInt32}, + Bool,) +dtypes = (numeric_dtypes..., + MaxLengthString{10,UInt8},MaxLengthString{10,UInt32}, String) compressors = ( "no"=>NoCompressor(), @@ -37,9 +40,17 @@ compressors = ( "blosc_noshuffle"=>BloscCompressor(cname="zstd",shuffle=0), "blosc_bitshuffle"=>BloscCompressor(cname="zstd",shuffle=2), "zlib"=>ZlibCompressor()) +filters = ( + "fletcher32"=>Fletcher32Filter(), + "scale_offset"=>FixedScaleOffsetFilter(offset=1000, scale=10^6, T=Float64, Tenc=Int32), + "shuffle"=>ShuffleFilter(elementsize=4), + "quantize"=>QuantizeFilter{Float64,Float32}(digits=5), + "delta"=>DeltaFilter{Int32}() +) testarrays = Dict(t=>(t<:AbstractString) ? 
[randstring(maximum(i.I)) for i in CartesianIndices((1:10,1:6,1:2))] : rand(t,10,6,2) for t in dtypes) testzerodimarrays = Dict(t=>(t<:AbstractString) ? randstring(10) : rand(t) for t in dtypes) +# Test arrays with compressors for t in dtypes, co in compressors compstr, comp = co att = Dict("This is a nested attribute"=>Dict("a"=>5)) @@ -49,6 +60,21 @@ for t in dtypes, co in compressors a = zcreate(t, g,string("azerodim",t,compstr), compressor=comp) a[] = testzerodimarrays[t] end + +# Test arrays with filters +for (filterstr, filter) in filters + t = eltype(filter) == Any ? Float64 : eltype(filter) + att = Dict("Filter test attribute"=>Dict("b"=>6)) + a = zcreate(t, g,string("filter_",filterstr),10,6,2,attrs=att, chunks = (5,2,2),filters=[filter]) + testdata = rand(t,10,6,2) + a[:,:,:] = testdata + + # Test zero-dimensional array + a = zcreate(t, g,string("filter_zerodim_",filterstr), filters=[filter]) + testzerodim = rand(t) + a[] = testzerodim +end + #Also save as zip file. open(pjulia*".zip";write=true) do io Zarr.writezip(io, g) @@ -58,6 +84,7 @@ end for julia_path in (pjulia, pjulia*".zip") py""" import zarr +import numcodecs g = zarr.open_group($julia_path) gatts = g.attrs """ @@ -67,7 +94,6 @@ gatts = g.attrs @test py"gatts['Int attribute']" == 5 @test py"gatts['Float attribute']" == 10.5 - dtypesp = ("uint8","uint16","uint32","uint64", "int8","int16","int32","int64", "float16","float32","float64", @@ -95,6 +121,30 @@ for i=1:length(dtypes), co in compressors end end +# Test reading filtered arrays from python +for (filterstr, filter) in filters + t = eltype(filter) == Any ? Float64 : eltype(filter) + arname = string("filter_",filterstr) + try + py""" + ar=g[$arname] + """ + catch e + @error "Error loading group with filter $filterstr" exception=(e,catch_backtrace()) + @test false # test failed. 
+ end + + @test py"ar.attrs['Filter test attribute']" == Dict("b"=>6) + @test py"ar.shape" == (2,6,10) + + # Test zero-dimensional filtered array + arname = string("filter_zerodim_",filterstr) + py""" + ar_zero=g[$arname] + """ + @test py"ar_zero.shape" == () +end + for i=1:length(dtypes), co in compressors compstr,comp = co t = dtypes[i] @@ -244,6 +294,4 @@ for unit in ["Week", "Day", "Hour", "Minute", "Second", @test_py np.datetime64(g_julia[unit][100] |> DateTime |> string) == get(getproperty(g_python,unit),99) end - - end From 1fe11f629f4d42123e5bc369a26461da7667ed62 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 22 Nov 2024 11:02:53 -0500 Subject: [PATCH 30/31] Fix filter astype, id to conform to Python names --- src/Filters/delta.jl | 4 ++-- src/Filters/fixedscaleoffset.jl | 4 ++-- src/Filters/quantize.jl | 12 ++++-------- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/Filters/delta.jl b/src/Filters/delta.jl index f7cdc3d..9d1de04 100644 --- a/src/Filters/delta.jl +++ b/src/Filters/delta.jl @@ -35,11 +35,11 @@ function zdecode(data::AbstractArray, filter::DeltaFilter{DecodingType, Encoding end function JSON.lower(filter::DeltaFilter{T, Tenc}) where {T, Tenc} - return Dict("id" => "delta", "dtype" => typestr(T), "atype" => typestr(Tenc)) + return Dict("id" => "delta", "dtype" => typestr(T), "astype" => typestr(Tenc)) end function getfilter(::Type{<: DeltaFilter}, d) - return DeltaFilter{typestr(d["dtype"], haskey(d, "atype") ? typestr(d["atype"]) : d["dtype"])}() 
+ return DeltaFilter{typestr(d["dtype"]), typestr(get(d, "astype", d["dtype"]))}() end filterdict["delta"] = DeltaFilter \ No newline at end of file diff --git a/src/Filters/fixedscaleoffset.jl b/src/Filters/fixedscaleoffset.jl index 1924ca1..9e12c52 100644 --- a/src/Filters/fixedscaleoffset.jl +++ b/src/Filters/fixedscaleoffset.jl @@ -39,14 +39,14 @@ function getfilter(::Type{<: FixedScaleOffsetFilter}, d::Dict) offset = d["offset"] # Types must be converted from strings to the actual Julia types they represent. string_T = d["dtype"] - string_Tenc = get(d, "atype", string_T) + string_Tenc = get(d, "astype", string_T) T = typestr(string_T) Tenc = typestr(string_Tenc) return FixedScaleOffsetFilter{Tenc, T, Tenc}(scale, offset) end function JSON.lower(c::FixedScaleOffsetFilter{ScaleOffsetType, T, Tenc}) where {ScaleOffsetType, T, Tenc} - return Dict("id" => "fixedscaleoffset", "scale" => c.scale, "offset" => c.offset, "dtype" => typestr(T), "atype" => typestr(Tenc)) + return Dict("id" => "fixedscaleoffset", "scale" => c.scale, "offset" => c.offset, "dtype" => typestr(T), "astype" => typestr(Tenc)) end filterdict["fixedscaleoffset"] = FixedScaleOffsetFilter diff --git a/src/Filters/quantize.jl b/src/Filters/quantize.jl index 12ad9ee..c5d7c9a 100644 --- a/src/Filters/quantize.jl +++ b/src/Filters/quantize.jl @@ -31,13 +31,9 @@ function zencode(data::AbstractArray, filter::QuantizeFilter{DecodingType, Encod bits = ceil(log(2, 10.0^(-exponent))) scale = 2.0^bits - enc = @. round(scale * arr) / scale + enc = @. convert(EncodingType, round(scale * arr) / scale) - if EncodingType == DecodingType - return enc - else - return reinterpret(EncodingType, enc) - end + return enc end # Decoding is a no-op; quantization is a lossy filter but data is encoded directly. 
@@ -46,11 +42,11 @@ function zdecode(data::AbstractArray, filter::QuantizeFilter{DecodingType, Encod end function JSON.lower(filter::QuantizeFilter{T, Tenc}) where {T, Tenc} - return Dict("type" => "quantize", "digits" => filter.digits, "dtype" => typestr(T), "atype" => typestr(Tenc)) + return Dict("id" => "quantize", "digits" => filter.digits, "dtype" => typestr(T), "astype" => typestr(Tenc)) end function getfilter(::Type{<: QuantizeFilter}, d) - return QuantizeFilter{typestr(d["dtype"], typestr(d["atype"]))}(; digits = d["digits"]) + return QuantizeFilter{typestr(d["dtype"]), typestr(get(d, "astype", d["dtype"]))}(; digits = d["digits"]) end filterdict["quantize"] = QuantizeFilter \ No newline at end of file From 4ca87a6a428fdb7e9c8d4ce615d3814ec9cf6878 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 22 Nov 2024 11:03:07 -0500 Subject: [PATCH 31/31] remove encoding validity check for quantize - it's pointless --- test/Filters.jl | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/test/Filters.jl b/test/Filters.jl index 6a0a271..f46cf4a 100644 --- a/test/Filters.jl +++ b/test/Filters.jl @@ -110,17 +110,7 @@ end reshape(LinRange(100, 200, 1000), (10, 10, 10)), # np.linspace(100, 200, 1000, dtype='