Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce compression #11

Merged
merged 11 commits into from
May 31, 2019
1 change: 1 addition & 0 deletions .appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ platform:
matrix:
allow_failures:
- julia_version: nightly
- platform: x86 # see https://github.com/invenia/JLSO.jl/issues/12
branches:
only:
- master
Expand Down
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ authors = ["Rory Finnegan <[email protected]>"]

[deps]
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Memento = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
Expand Down
24 changes: 16 additions & 8 deletions src/JLSO.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ Example)
```
Dict(
"metadata" => Dict(
"version" => v"1.0",
"julia" => v"0.6.4",
"format" => :bson, # Could also be :serialize
"version" => v"2.0",
"julia" => v"1.0.4",
"format" => :bson, # Could also be :julia_serialize
"compression" => :gzip_fastest, # could also be: :none, :gzip_smallest, or :gzip
"image" => "xxxxxxxxxxxx.dkr.ecr.us-east-1.amazonaws.com/myrepository:latest"
"pkgs" => Dict(
"AxisArrays" => v"0.2.1",
Expand All @@ -26,24 +27,31 @@ Dict(
),
)
```
WARNING: The serialized objects are stored using julia's builtin serialization format which
is not intended for long term storage. As a result, we're storing the serialized object data
WARNING: Regardless of serialization `format`, the serialized objects can not be deserialized
into structures with different fields, or if the types have been renamed or removed from the
packages.
Further, the `:julia_serialize` format is not intended for long term storage and is not
portable across julia versions. As a result, we're storing the serialized object data
in a json file which should also be able to load the docker image and versioninfo to allow
reconstruction.
"""
module JLSO

using BSON
using CodecZlib
using Serialization
using Memento
using Pkg
using Pkg: Pkg
using Pkg.Types: semver_spec

export JLSOFile

const LOGGER = getlogger(@__MODULE__)
const VALID_VERSIONS = (v"1.0", v"2.0")
const READABLE_VERSIONS = semver_spec("1, 2")
const WRITEABLE_VERSIONS = semver_spec("2")

const LOGGER = getlogger(@__MODULE__)
__init__() = Memento.register(LOGGER)

include("JLSOFile.jl")
include("file_io.jl")
include("metadata.jl")
Expand Down
29 changes: 21 additions & 8 deletions src/JLSOFile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@ struct JLSOFile
version::VersionNumber
julia::VersionNumber
format::Symbol
compression::Symbol
image::String
pkgs::Dict{String, VersionNumber}
objects::Dict{String, Vector{UInt8}}
end

"""
JLSOFile(data; image="", julia=$VERSION, version=v"1.0, format=:serialize)
JLSOFile(data; format=:julia_serialize, compression=:gzip, kwargs...)

Stores the information needed to write a .jlso file.

Expand All @@ -20,22 +21,32 @@ Stores the information needed to write a .jlso file.

- `image=""` - The docker image URI that was used to generate the file
- `julia=$VERSION` - The julia version used to write the file
- `version=v"1.0"` - The file schema version
- `format=:serialize` - The format to use for serializing individual objects. While `:bson` is
recommended for longer term object storage, `:serialize` tends to be the faster choice
- `version=v"2.0"` - The file schema version
- `format=:julia_serialize` - The format to use for serializing individual objects. While `:bson` is
recommended for longer term object storage, `:julia_serialize` tends to be the faster choice
for adhoc serialization.
- `compression=:gzip`, what form of compression to apply to the objects.
Use :none, to not compress. :gzip_fastest for the fastest gzip compression,
:gzip_smallest for the most compact (but slowest), or :gzip for a generally good compromise.
Due to the time taken for disk IO, :none is not normally as fast as using some compression.
"""
function JLSOFile(
data::Dict{String, <:Any};
version=v"1.0",
version=v"2.0.0",
julia=VERSION,
format=:serialize,
format=:julia_serialize,
compression=:gzip,
image=_image(),
)
_versioncheck(version)
if format === :serialize
# Deprecation warning
@warn "The `:serialize` format has been renamed to `:julia_serialize`."
format = :julia_serialize
end

_versioncheck(version, WRITEABLE_VERSIONS)
objects = Dict{String, Vector{UInt8}}()
jlso = JLSOFile(version, julia, format, image, _pkgs(), objects)
jlso = JLSOFile(version, julia, format, compression, image, _pkgs(), objects)

for (key, val) in data
jlso[key] = val
Expand All @@ -54,6 +65,7 @@ function Base.show(io::IO, jlso::JLSOFile)
"version=v\"$(jlso.version)\"",
"julia=v\"$(jlso.julia)\"",
"format=:$(jlso.format)",
"compression=:$(jlso.compression)",
"image=\"$(jlso.image)\"",
],
", "
Expand All @@ -69,6 +81,7 @@ function Base.:(==)(a::JLSOFile, b::JLSOFile)
a.image == b.image &&
a.pkgs == b.pkgs &&
a.format == b.format &&
a.compression == b.compression &&
a.objects == b.objects
)
end
Expand Down
18 changes: 18 additions & 0 deletions src/file_io.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
# that is done lazily and the code for that is in serialization.jl

function Base.write(io::IO, jlso::JLSOFile)
_versioncheck(jlso.version, WRITEABLE_VERSIONS)
bson(
io,
Dict(
"metadata" => Dict(
"version" => jlso.version,
"julia" => jlso.julia,
"format" => jlso.format,
"compression" => jlso.compression,
"image" => jlso.image,
"pkgs" => jlso.pkgs,
),
Expand All @@ -22,16 +24,32 @@ end
# they will be `deserialized` when they are indexed out of the returned JLSOFile object.
function Base.read(io::IO, ::Type{JLSOFile})
    # Load the whole BSON document from `io`.
    parsed = BSON.load(io)

    # Reject schema versions we cannot read, then migrate older (still readable)
    # layouts in place so that every metadata key used below is present.
    _versioncheck(parsed["metadata"]["version"], READABLE_VERSIONS)
    upgrade_jlso!(parsed)

    meta = parsed["metadata"]
    return JLSOFile(
        meta["version"],
        meta["julia"],
        meta["format"],
        meta["compression"],
        meta["image"],
        meta["pkgs"],
        parsed["objects"],
    )
end

"""
    upgrade_jlso!(raw_dict::AbstractDict)

Upgrade the raw dictionary of a version 1 JLSO file to the version 2 layout by mutating
`raw_dict["metadata"]` in place. Dictionaries whose metadata version is not matched by
`semver_spec("1")` are left untouched.

For version 1 metadata this:
- renames the `:serialize` format to `:julia_serialize` (other formats are kept as-is),
- sets `"compression"` to `:none`,
- sets `"version"` to `v"2"`.

Returns `raw_dict`.
"""
function upgrade_jlso!(raw_dict::AbstractDict)
    metadata = raw_dict["metadata"]
    if metadata["version"] ∈ semver_spec("1")
        # `:serialize` is the old name of the `:julia_serialize` format.
        if metadata["format"] == :serialize
            metadata["format"] = :julia_serialize
        end
        # Record the upgraded objects as uncompressed so readers of the version 2
        # layout find a `compression` entry.
        metadata["compression"] = :none
        metadata["version"] = v"2"
    end
    return raw_dict
end


"""
save(io, data)
save(path, data)
Expand Down
6 changes: 3 additions & 3 deletions src/metadata.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""
    _versioncheck(version::VersionNumber, valid_versions)

Check that `version ∈ valid_versions`. `valid_versions` can be anything that supports
`∈` with a `VersionNumber`, e.g. the `READABLE_VERSIONS` / `WRITEABLE_VERSIONS` specs.

Returns `true` when the version is supported. Otherwise an `ArgumentError` describing
the unsupported version is passed to `error(LOGGER, ...)`.
"""
function _versioncheck(version::VersionNumber, valid_versions)
    supported = version ∈ valid_versions
    supported || error(LOGGER, ArgumentError(
        string(
            "Unsupported version ($version). ",
            "Expected a value between ($valid_versions)."
        )
    ))
end
Expand Down
67 changes: 50 additions & 17 deletions src/serialization.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,57 @@
# This is the code that handles the serialization and deserialization of each object

# Singleton tag type: the format symbol is lifted into the type parameter so that each
# format gets its own `serialize`/`deserialize` method via dispatch.
struct Formatter{S} end

# Symbol entry points: wrap the symbol in a `Formatter` and re-dispatch.
# A symbol with no matching `Formatter` method below results in a `MethodError`.
function deserialize(format::Symbol, io)
    return deserialize(Formatter{format}(), io)
end

function serialize(format::Symbol, io, value)
    return serialize(Formatter{format}(), io, value)
end

# BSON: the value is stored as the single entry of a dictionary under the key "object";
# reading takes the first value of the loaded dictionary.
function deserialize(::Formatter{:bson}, io)
    return first(values(BSON.load(io)))
end

function serialize(::Formatter{:bson}, io, value)
    return bson(io, Dict("object" => value))
end

# Julia's builtin serialization from the `Serialization` stdlib.
function deserialize(::Formatter{:julia_serialize}, io)
    return Serialization.deserialize(io)
end

function serialize(::Formatter{:julia_serialize}, io, value)
    return Serialization.serialize(io, value)
end

# Singleton tag type: the compression symbol is lifted into the type parameter so that
# each compression scheme gets its own `compress`/`decompress` method via dispatch.
struct Compressor{S} end

# Symbol entry points: wrap the symbol in a `Compressor` and re-dispatch.
function compress(compression::Symbol, io)
    return compress(Compressor{compression}(), io)
end

function decompress(compression::Symbol, io)
    return decompress(Compressor{compression}(), io)
end

# :none - hand the stream back untouched in both directions.
function compress(::Compressor{:none}, io)
    return io
end

function decompress(::Compressor{:none}, io)
    return io
end

# :gzip - gzip stream constructed without an explicit level.
function compress(::Compressor{:gzip}, io)
    return GzipCompressorStream(io)
end

function decompress(::Compressor{:gzip}, io)
    return GzipDecompressorStream(io)
end

# :gzip_fastest - gzip at level 1.
function compress(::Compressor{:gzip_fastest}, io)
    return GzipCompressorStream(io; level=1)
end

function decompress(::Compressor{:gzip_fastest}, io)
    return GzipDecompressorStream(io)
end

# :gzip_smallest - gzip at level 9.
function compress(::Compressor{:gzip_smallest}, io)
    return GzipCompressorStream(io; level=9)
end

function decompress(::Compressor{:gzip_smallest}, io)
    return GzipDecompressorStream(io)
end


"""
complete_compression(compressing_buffer)
Writes any end of compression sequence to the compressing buffer;
but does not close the underlying stream.
The compressing_buffer itself should not be used after this operation
"""
# Fallback: anything that is not a transcoding stream has no trailer to write.
function complete_compression(::Any)
    return nothing
end

function complete_compression(compressing_buffer::CodecZlib.TranscodingStream)
    # The compressing stream has to be put into its closed state so that the codec can
    # emit whatever end-of-body data it needs. A regular `close` is not usable here
    # because it would close the wrapped `buffer` too; switching the mode directly
    # avoids that.
    # see https://github.com/bicycle1885/TranscodingStreams.jl/issues/85
    return CodecZlib.TranscodingStreams.changemode!(compressing_buffer, :close)
end


"""
getindex(jlso, name)

Returns the deserialized object with the specified name.
"""
function Base.getindex(jlso::JLSOFile, name::String)
try
if jlso.format === :bson
BSON.load(IOBuffer(jlso.objects[name]))[name]
elseif jlso.format === :serialize
deserialize(IOBuffer(jlso.objects[name]))
else
error(LOGGER, ArgumentError("Unsupported format $(jlso.format)"))
end
buffer = IOBuffer(jlso.objects[name])
decompressing_buffer = decompress(jlso.compression, buffer)
return deserialize(jlso.format, decompressing_buffer)
catch e
warn(LOGGER, e)
return jlso.objects[name]
Expand All @@ -26,15 +64,10 @@ end
Adds the object to the file and serializes it.
"""
function Base.setindex!(jlso::JLSOFile, value, name::String)
    # `buffer` collects the final bytes; `compressing_buffer` is either `buffer` itself
    # (for `:none`) or a compressing stream wrapped around it.
    buffer = IOBuffer()
    compressing_buffer = compress(jlso.compression, buffer)

    # Serialize `value` in the file's format through the (possibly compressing) stream.
    serialize(jlso.format, compressing_buffer, value)

    # Write any end-of-compression data into `buffer` without closing `buffer`, so its
    # contents can still be taken below.
    complete_compression(compressing_buffer)

    jlso.objects[name] = take!(buffer)
end
1 change: 1 addition & 0 deletions test/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
demo
12 changes: 6 additions & 6 deletions test/JLSOFile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
# Reset the cached image for future tests
JLSO._CACHE[:IMAGE] = ""

@testset "$fmt - $k" for fmt in (:bson, :serialize), (k, v) in datas
jlso = JLSOFile(k => v; format=fmt)
@testset "$(fmt): $k" for fmt in (:bson, :julia_serialize), (k, v) in datas
jlso = JLSOFile(k => v; format=fmt, compression=:none)
io = IOBuffer()
bytes = fmt === :bson ? bson(io, Dict(k => v)) : serialize(io, v)
bytes = fmt === :bson ? bson(io, Dict("object" => v)) : serialize(io, v)
expected = take!(io)

@test jlso.objects[k] == expected
Expand All @@ -21,16 +21,16 @@ end
@testset "unknown format" begin
    # An unrecognised format symbol has no matching `Formatter` method, so the
    # failure surfaces as a `MethodError`.
    @test_throws(
        LOGGER,
        MethodError,
        JLSOFile("String" => "Hello World!", format=:unknown)
    )
end

@testset "show" begin
    jlso = JLSOFile(datas["String"])
    # NOTE(review): `expected` is built but never compared against the output, and it
    # has no `compression=` entry although `show` now prints one - confirm whether an
    # assertion against `expected` is intended.
    expected = string(
        "JLSOFile([data]; version=v\"2.0.0\", julia=v\"$VERSION\", ",
        "format=:julia_serialize, image=\"\")"
    )
    # `show` and `print` must render the file identically.
    @test sprint(show, jlso) == sprint(print, jlso)
end
34 changes: 34 additions & 0 deletions test/backwards_compat.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
@testset "upgrade_jlso" begin
    # Metadata already at the current version must come back unchanged.
    @testset "no change for current version" begin
        raw = Dict("metadata" => Dict("version" => v"2.0"))
        upgrade_jlso!(raw)
        @test raw == Dict("metadata" => Dict("version" => v"2.0"))
    end

    # Version 1 metadata gains a compression entry, a bumped version and, for the
    # `:serialize` format, the renamed format symbol.
    @testset "upgrade 1.0" begin
        raw = Dict("metadata" => Dict("version" => v"1.0", "format" => :serialize))
        upgrade_jlso!(raw)
        upgraded_meta = Dict(
            "version" => v"2.0", "format" => :julia_serialize, "compression" => :none
        )
        @test raw == Dict("metadata" => upgraded_meta)

        @testset "Don't rename bson format" begin
            raw_bson = Dict("metadata" => Dict("version" => v"1.0", "format" => :bson))
            upgrade_jlso!(raw_bson)
            upgraded_bson_meta = Dict(
                "version" => v"2.0", "format" => :bson, "compression" => :none
            )
            @test raw_bson == Dict("metadata" => upgraded_bson_meta)
        end
    end
end

# The below is how we saved the specimens for compat testing
# JLSO.save("specimens/v1_serialize.jlso", datas; format=:serialize)
# JLSO.save("specimens/v1_bson.jlso", datas; format=:bson)

@testset "Can still load old files" begin
    # Every file in the `specimens` directory next to this test file must load back
    # to the reference `datas`.
    specimen_dir = joinpath(@__DIR__, "specimens")
    @testset "$fn" for fn in readdir(specimen_dir)
        specimen_path = joinpath(specimen_dir, fn)
        @test JLSO.load(specimen_path) == datas
    end
end
Loading