From 9a8892adc8df0560852b98edcdae9cd9ac9562ee Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Mon, 21 Oct 2024 04:57:42 -0700 Subject: [PATCH] Refactor compressors to be in separate files (#153) * Refactor compressors into multiple files Refactors each compressor into a single file and the abstract API + NoCompressor to `Compressors/Compressors.jl`. Also adds API docs for the Compressor API in case people want to do that. Future work may also explore making this compliant with Interfaces.jl so that we have a way to test that all compressors are compliant. * Get docs building again TODO: there must be a better solution than this! * Implement and test `fletcher32` compression * Fix tests by explicitly importing * Revert "Fix tests by explicitly importing" This reverts commit df8ed7a4d3cfd8a951e6431eda7b1a2fa05c4d67. * Revert "Implement and test `fletcher32` compression" This reverts commit d7f2a6912709c775de473e8722ef9b68ebbd8d2b. * Update src/Compressors/Compressors.jl Co-authored-by: Anshul Singhvi --------- Co-authored-by: Fabian Gans --- docs/src/reference.md | 2 +- src/Compressors.jl | 151 --------------------------------- src/Compressors/Compressors.jl | 106 +++++++++++++++++++++++ src/Compressors/blosc.jl | 70 +++++++++++++++ src/Compressors/zlib.jl | 38 +++++++++ src/Zarr.jl | 2 +- 6 files changed, 216 insertions(+), 153 deletions(-) delete mode 100644 src/Compressors.jl create mode 100644 src/Compressors/Compressors.jl create mode 100644 src/Compressors/blosc.jl create mode 100644 src/Compressors/zlib.jl diff --git a/docs/src/reference.md b/docs/src/reference.md index f9d4402..4cf889a 100644 --- a/docs/src/reference.md +++ b/docs/src/reference.md @@ -18,5 +18,5 @@ Pages = ["ZGroup.jl"] ```@autodocs Modules = [Zarr] -Pages = ["Compressors.jl"] +Pages = ["Compressors/Compressors.jl", "Compressors/blosc.jl", "Compressors/zlib.jl"] ``` diff --git a/src/Compressors.jl b/src/Compressors.jl deleted file mode 100644 index b54e97a..0000000 --- 
a/src/Compressors.jl +++ /dev/null @@ -1,151 +0,0 @@ -import Blosc -import CodecZlib -import JSON - -_reinterpret(::Type{T}, x::AbstractArray{S, 0}) where {T, S} = reinterpret(T, reshape(x, 1)) -_reinterpret(::Type{T}, x::AbstractArray) where T = reinterpret(T, x) - -abstract type Compressor end -getCompressor(compdict::Dict) = getCompressor(compressortypes[compdict["id"]],compdict) -getCompressor(::Nothing) = NoCompressor() - -#Compression when no filter is given -zcompress!(compressed,data,c,::Nothing) = zcompress!(compressed,data,c) -zuncompress!(data,compressed,c,::Nothing) = zuncompress!(data,compressed,c) - -#Fallback definition of mutating form of compress and uncompress -function zcompress!(compressed, data, c) - empty!(compressed) - append!(compressed,zcompress(data, c)) -end -zuncompress!(data, compressed, c) = copyto!(data, zuncompress(compressed, c, eltype(data))) - - -#Function given a filter stack -function zcompress!(compressed, data, c, f) - a2 = foldl(f, init=data) do anow, fnow - zencode(anow,fnow) - end - zcompress!(compressed, a2, c) -end - -function zuncompress!(data, compressed, c, f) - data2 = zuncompress(compressed, c, desttype(last(f))) - a2 = foldr(f, init = data2) do fnow, anow - zdecode(anow, fnow) - end - copyto!(data, a2) -end - - -struct BloscCompressor <: Compressor - blocksize::Int - clevel::Int - cname::String - shuffle::Int -end - -""" - BloscCompressor(;blocksize=0, clevel=5, cname="lz4", shuffle=1) - -Returns a `BloscCompressor` struct that can serve as a Zarr array compressor. Keyword arguments are: - -* `clevel=5` the compression level, number between 0 (no compression) and 9 (max compression) -* `cname="lz4"` compressor name, can be one of `"blosclz"`, `"lz4"`, and `"lz4hc"` -* `shuffle=1` Either NOSHUFFLE (0), SHUFFLE (1), BITSHUFFLE (2) or AUTOSHUFFLE (-1). - If AUTOSHUFFLE, bit-shuffle will be used for buffers with itemsize 1, and byte-shuffle will be used otherwise. The default is SHUFFLE. 
-""" -BloscCompressor(;blocksize=0, clevel=5, cname="lz4", shuffle=1) = - BloscCompressor(blocksize, clevel, cname, shuffle) - -function getCompressor(::Type{BloscCompressor}, d::Dict) - BloscCompressor(d["blocksize"], d["clevel"], d["cname"], d["shuffle"]) -end - -zuncompress(a, ::BloscCompressor, T) = Blosc.decompress(Base.nonmissingtype(T), a) - -function zuncompress!(data::DenseArray, compressed, ::BloscCompressor) - Blosc.decompress!(vec(data),compressed) - # if Int(pointer(data,length(data))-pointer(data)) != (length(data)-1)*sizeof(eltype(data)) - # @show size(data) - # @show size(parent(data)) - # @show typeof(data) - # @show Int(pointer(data,length(data))-pointer(data)) - # @show (length(data)-1)*sizeof(eltype(data)) - # error("Something is wrong") - # end - # Zarr.Blosc.blosc_decompress(data, compressed, sizeof(data)) -end - - -function zcompress(a, c::BloscCompressor) - itemsize = sizeof(eltype(a)) - shuffle = c.shuffle - # Weird auto shuffle logic from - # https://github.com/zarr-developers/numcodecs/blob/7d8f9762b4f0f9b5e135688b2eeb3f783f90f208/numcodecs/blosc.pyx#L264-L272 - if shuffle == -1 - if itemsize == 1 - shuffle = Blosc.BITSHUFFLE - else - shuffle = Blosc.SHUFFLE - end - elseif shuffle ∉ (Blosc.NOSHUFFLE, Blosc.SHUFFLE, Blosc.BITSHUFFLE) - throw(ArgumentError("invalid shuffle argument; expected -1, 0, 1 or 2, found $shuffle")) - end - Blosc.set_compressor(c.cname) - Blosc.compress(a; level=c.clevel, shuffle=shuffle) -end - -JSON.lower(c::BloscCompressor) = Dict("id"=>"blosc", "cname"=>c.cname, - "clevel"=>c.clevel, "shuffle"=>c.shuffle, "blocksize"=>c.blocksize) - -""" - NoCompressor() - -Creates an object that can be passed to ZArray constructors without compression. 
-""" -struct NoCompressor <: Compressor end - -function zuncompress(a, ::NoCompressor, T) - _reinterpret(T,a) -end - -function zcompress(a, ::NoCompressor) - _reinterpret(UInt8,a) -end - -JSON.lower(::NoCompressor) = nothing - -compressortypes = Dict("blosc"=>BloscCompressor, nothing=>NoCompressor) - - - -""" - ZlibCompressor(clevel=-1) -Returns a `ZlibCompressor` struct that can serve as a Zarr array compressor. Keyword arguments are: -* `clevel=-1` the compression level, number between -1 (Default), 0 (no compression) and 9 (max compression) -* default is -1 compromise between speed and compression (currently equivalent to level 6). -""" -struct ZlibCompressor <: Compressor - clevel::Int -end - -ZlibCompressor(;clevel=-1) = ZlibCompressor(clevel) - -function getCompressor(::Type{ZlibCompressor}, d::Dict) - ZlibCompressor(d["level"]) -end - -function zuncompress(a, ::ZlibCompressor, T) - result = transcode(CodecZlib.ZlibDecompressor,a) - _reinterpret(Base.nonmissingtype(T),result) -end - -function zcompress(a, ::ZlibCompressor) - a_uint8 = _reinterpret(UInt8,a)[:] - transcode(CodecZlib.ZlibCompressor, a_uint8) -end - -JSON.lower(z::ZlibCompressor) = Dict("id"=>"zlib", "level" => z.clevel) - -Zarr.compressortypes["zlib"] = ZlibCompressor diff --git a/src/Compressors/Compressors.jl b/src/Compressors/Compressors.jl new file mode 100644 index 0000000..e676c95 --- /dev/null +++ b/src/Compressors/Compressors.jl @@ -0,0 +1,106 @@ +import JSON # for JSON.lower + +_reinterpret(::Type{T}, x::AbstractArray{S, 0}) where {T, S} = reinterpret(T, reshape(x, 1)) +_reinterpret(::Type{T}, x::AbstractArray) where T = reinterpret(T, x) + +""" + abstract type Compressor + +The abstract supertype for all Zarr compressors. + +## Interface + +All subtypes of `Compressor` SHALL implement the following methods: + +- `zcompress(a, c::Compressor)`: compress the array `a` using the compressor `c`. 
+- `zuncompress(a, c::Compressor, T)`: uncompress the array `a` using the compressor `c` + and return an array with element type `T`. +- `JSON.lower(c::Compressor)`: return a JSON representation of the compressor `c`, which + follows the Zarr specification for that compressor. +- `getCompressor(::Type{<:Compressor}, d::Dict)`: return a compressor object from a given + dictionary `d` which contains the compressor's parameters according to the Zarr spec. + +Subtypes of `Compressor` MAY also implement the following methods: + +- `zcompress!(compressed, data, c::Compressor)`: compress the array `data` using the + compressor `c` and store the result in the array `compressed`. +- `zuncompress!(data, compressed, c::Compressor)`: uncompress the array `compressed` + using the compressor `c` and store the result in the array `data`. + +Finally, an entry MUST be added to the `compressortypes` dictionary for each compressor type. +The key of the entry MUST be the name that the Zarr specification uses for that compressor, +and the value is the compressor type (e.g. `BloscCompressor` or `NoCompressor`). + +For example, the Blosc compressor is named "blosc" in the Zarr spec, so the entry for [`BloscCompressor`](@ref) +must be added to `compressortypes` as `compressortypes["blosc"] = BloscCompressor`. +""" +abstract type Compressor end + +const compressortypes = Dict{Union{String,Nothing}, Type{<: Compressor}}() + +# function getCompressor end +# function zcompress end +# function zuncompress end +# function zcompress! end +# function zuncompress! 
end +# JSON.lower is neither defined nor documented here, since that would be documentation piracy :yarr: + +# Include the compressor implementations +include("blosc.jl") +include("zlib.jl") + +# ## Fallback definitions for the compressor interface +# Define fallbacks and generic methods for the compressor interface +getCompressor(compdict::Dict) = getCompressor(compressortypes[compdict["id"]],compdict) +getCompressor(::Nothing) = NoCompressor() + +# Compression when no filter is given +zcompress!(compressed,data,c,::Nothing) = zcompress!(compressed,data,c) +zuncompress!(data,compressed,c,::Nothing) = zuncompress!(data,compressed,c) + +# Fallback definition of mutating form of compress and uncompress +function zcompress!(compressed, data, c) + empty!(compressed) + append!(compressed,zcompress(data, c)) +end +zuncompress!(data, compressed, c) = copyto!(data, zuncompress(compressed, c, eltype(data))) + + +# Function given a filter stack +function zcompress!(compressed, data, c, f) + a2 = foldl(f, init=data) do anow, fnow + zencode(anow,fnow) + end + zcompress!(compressed, a2, c) +end + +function zuncompress!(data, compressed, c, f) + data2 = zuncompress(compressed, c, desttype(last(f))) + a2 = foldr(f, init = data2) do fnow, anow + zdecode(anow, fnow) + end + copyto!(data, a2) +end + +# ## `NoCompressor` +# The default and most minimal implementation of a compressor follows here, which does +# no actual compression. This is a good reference implementation for other compressors. + +""" + NoCompressor() + +Creates an object that can be passed to ZArray constructors without compression. 
+""" +struct NoCompressor <: Compressor end + +function zuncompress(a, ::NoCompressor, T) + _reinterpret(T,a) +end + +function zcompress(a, ::NoCompressor) + _reinterpret(UInt8,a) +end + +JSON.lower(::NoCompressor) = nothing + +compressortypes[nothing] = NoCompressor \ No newline at end of file diff --git a/src/Compressors/blosc.jl b/src/Compressors/blosc.jl new file mode 100644 index 0000000..789a298 --- /dev/null +++ b/src/Compressors/blosc.jl @@ -0,0 +1,70 @@ +#= +# Blosc compression + +This file implements a Blosc compressor via Blosc.jl. +=# + +import Blosc + +struct BloscCompressor <: Compressor + blocksize::Int + clevel::Int + cname::String + shuffle::Int +end + +""" + BloscCompressor(;blocksize=0, clevel=5, cname="lz4", shuffle=1) + +Returns a `BloscCompressor` struct that can serve as a Zarr array compressor. Keyword arguments are: + +* `clevel=5` the compression level, number between 0 (no compression) and 9 (max compression) +* `cname="lz4"` compressor name, can be one of `"blosclz"`, `"lz4"`, and `"lz4hc"` +* `shuffle=1` Either NOSHUFFLE (0), SHUFFLE (1), BITSHUFFLE (2) or AUTOSHUFFLE (-1). + If AUTOSHUFFLE, bit-shuffle will be used for buffers with itemsize 1, and byte-shuffle will be used otherwise. The default is SHUFFLE. 
+""" +BloscCompressor(;blocksize=0, clevel=5, cname="lz4", shuffle=1) = + BloscCompressor(blocksize, clevel, cname, shuffle) + +function getCompressor(::Type{BloscCompressor}, d::Dict) + BloscCompressor(d["blocksize"], d["clevel"], d["cname"], d["shuffle"]) +end + +zuncompress(a, ::BloscCompressor, T) = Blosc.decompress(Base.nonmissingtype(T), a) + +function zuncompress!(data::DenseArray, compressed, ::BloscCompressor) + Blosc.decompress!(vec(data),compressed) + # if Int(pointer(data,length(data))-pointer(data)) != (length(data)-1)*sizeof(eltype(data)) + # @show size(data) + # @show size(parent(data)) + # @show typeof(data) + # @show Int(pointer(data,length(data))-pointer(data)) + # @show (length(data)-1)*sizeof(eltype(data)) + # error("Something is wrong") + # end + # Zarr.Blosc.blosc_decompress(data, compressed, sizeof(data)) +end + + +function zcompress(a, c::BloscCompressor) + itemsize = sizeof(eltype(a)) + shuffle = c.shuffle + # Weird auto shuffle logic from + # https://github.com/zarr-developers/numcodecs/blob/7d8f9762b4f0f9b5e135688b2eeb3f783f90f208/numcodecs/blosc.pyx#L264-L272 + if shuffle == -1 + if itemsize == 1 + shuffle = Blosc.BITSHUFFLE + else + shuffle = Blosc.SHUFFLE + end + elseif shuffle ∉ (Blosc.NOSHUFFLE, Blosc.SHUFFLE, Blosc.BITSHUFFLE) + throw(ArgumentError("invalid shuffle argument; expected -1, 0, 1 or 2, found $shuffle")) + end + Blosc.set_compressor(c.cname) + Blosc.compress(a; level=c.clevel, shuffle=shuffle) +end + +JSON.lower(c::BloscCompressor) = Dict("id"=>"blosc", "cname"=>c.cname, + "clevel"=>c.clevel, "shuffle"=>c.shuffle, "blocksize"=>c.blocksize) + +Zarr.compressortypes["blosc"] = BloscCompressor \ No newline at end of file diff --git a/src/Compressors/zlib.jl b/src/Compressors/zlib.jl new file mode 100644 index 0000000..3e4067f --- /dev/null +++ b/src/Compressors/zlib.jl @@ -0,0 +1,38 @@ +#= +# Zlib compression + +This file implements a Zlib compressor via CodecZlib.jl. 
+ +=# + +import CodecZlib + +""" + ZlibCompressor(;clevel=-1) +Returns a `ZlibCompressor` struct that can serve as a Zarr array compressor. Keyword arguments are: +* `clevel=-1` the compression level: either -1 (the default) or a number between 0 (no compression) and 9 (max compression) +* the default of -1 is a compromise between speed and compression (currently equivalent to level 6). +""" +struct ZlibCompressor <: Compressor + clevel::Int +end + +ZlibCompressor(;clevel=-1) = ZlibCompressor(clevel) + +function getCompressor(::Type{ZlibCompressor}, d::Dict) + ZlibCompressor(d["level"]) +end + +function zuncompress(a, ::ZlibCompressor, T) + result = transcode(CodecZlib.ZlibDecompressor,a) + _reinterpret(Base.nonmissingtype(T),result) +end + +function zcompress(a, ::ZlibCompressor) + a_uint8 = _reinterpret(UInt8,a)[:] + transcode(CodecZlib.ZlibCompressor, a_uint8) +end + +JSON.lower(z::ZlibCompressor) = Dict("id"=>"zlib", "level" => z.clevel) + +Zarr.compressortypes["zlib"] = ZlibCompressor \ No newline at end of file diff --git a/src/Zarr.jl b/src/Zarr.jl index 5f58e61..ecc221a 100644 --- a/src/Zarr.jl +++ b/src/Zarr.jl @@ -4,7 +4,7 @@ import JSON import Blosc include("metadata.jl") -include("Compressors.jl") +include("Compressors/Compressors.jl") include("Storage/Storage.jl") include("Filters.jl") include("ZArray.jl")