From 0676c4e0f71829158112abb99120258701075f26 Mon Sep 17 00:00:00 2001
From: Keno Fischer
Date: Sun, 3 May 2020 04:09:54 -0400
Subject: [PATCH] Increase default buffer size (#34)

* Increase default buffer size

Makes creating a tarball and compressing it with `zstdmt` about
6x faster (30s vs 5s). Raw `tar` is still about 20% faster, but
we'd probably need https://github.com/JuliaIO/Tar.jl/issues/33 to
make up the difference.

* Buffer for extract also

* 1.3 compat
---
 src/Tar.jl     |  3 ++
 src/create.jl  | 37 ++++++++++++++---------
 src/extract.jl | 80 +++++++++++++++++++++++++-------------------------
 3 files changed, 66 insertions(+), 54 deletions(-)

diff --git a/src/Tar.jl b/src/Tar.jl
index 6efbb25..fbb76c2 100644
--- a/src/Tar.jl
+++ b/src/Tar.jl
@@ -8,6 +8,9 @@ function Base.skip(io::Union{Base.Process, Base.ProcessChain}, n::Integer)
     end
 end
 
+# 2 MiB to take advantage of THP if enabled
+const DEFAULT_BUFFER_SIZE = 2 * 1024 * 1024
+
 include("header.jl")
 include("create.jl")
 include("extract.jl")
diff --git a/src/create.jl b/src/create.jl
index 136155a..aead9ba 100644
--- a/src/create.jl
+++ b/src/create.jl
@@ -9,7 +9,7 @@ function write_tarball(
     out::IO,
     sys_path::String, # path in the filesystem
     tar_path::String = ""; # path in the tarball
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     w = 0
     st = lstat(sys_path)
@@ -55,7 +55,7 @@ function write_tarball(
     out::IO,
     sys_path::String,
     tar_path::String = "";
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     write_tarball(p->true, out, sys_path, tar_path, buf=buf)
 end
@@ -63,7 +63,7 @@ end
 function write_header(
     out::IO,
     hdr::Header;
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     # extract values
     path = hdr.path
@@ -111,7 +111,7 @@ function write_extended_header(
     out::IO,
     metadata::Vector{Pair{String,String}};
     type::Symbol = :x, # default: non-global extended header
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     type in (:x, :g) ||
         throw(ArgumentError("invalid type flag for extended header: $(repr(type))"))
@@ -140,7 +140,7 @@ function write_standard_header(
     hdr::Header;
     name::AbstractString = "",
     prefix::AbstractString = "",
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     name = String(name)
     prefix = String(prefix)
@@ -169,8 +169,8 @@ function write_standard_header(
         throw(ArgumentError("non-ASCII type flag value: $(repr(type))"))
 
     # construct header block
-    resize!(buf, 512)
-    h = IOBuffer(fill!(buf, 0x00), write=true, truncate=false)
+    header_view = view(buf, 1:512)
+    h = IOBuffer(fill!(header_view, 0x00), write=true, truncate=false)
     write(h, name) # name
     seek(h, 100)
     write(h, "$m \0") # mode
@@ -204,14 +204,14 @@ function write_standard_header(
     write(h, prefix) # prefix
 
     # fix the checksum
-    c = string(sum(buf), base=8, pad=6)
+    c = string(sum(header_view), base=8, pad=6)
     @assert ncodeunits(c) ≤ 6
     seek(h, 148)
     write(h, "$c\0 ")
     @assert position(h) == 156
 
     # write header
-    w = write(out, buf)
+    w = write(out, header_view)
     @assert w == 512
     return w
 end
@@ -220,14 +220,23 @@ function write_data(
     tar::IO,
     file::IO;
     size::Integer,
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
-    resize!(buf, 512)
     w = s = 0
+    @assert sizeof(buf) % 512 == 0
     while !eof(file)
         s += n = readbytes!(file, buf)
-        n < 512 && (buf[n+1:512] .= 0)
-        w += write(tar, buf)
+        if n < sizeof(buf)
+            r = n % 512
+            if r != 0
+                pad = n - r + 512
+                buf[n+1:pad] .= 0
+                n = pad
+            end
+            w += write(tar, view(buf, 1:n))
+        else
+            w += write(tar, buf)
+        end
     end
     s == size || error("""
     data did not have the expected size:
@@ -242,7 +251,7 @@ function write_data(
     tar::IO,
     file::String;
     size::Integer,
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     open(file) do file′
         write_data(tar, file′, size=size, buf=buf)
diff --git a/src/extract.jl b/src/extract.jl
index 8d8e7f4..b68fc2f 100644
--- a/src/extract.jl
+++ b/src/extract.jl
@@ -1,8 +1,14 @@
+@static if VERSION < v"1.4.0-DEV"
+    view_read!(io, buf::SubArray{UInt8}) = readbytes!(io, buf, sizeof(buf))
+else
+    view_read!(io, buf::SubArray{UInt8}) = read!(io, buf)
+end
+
 function list_tarball(
     tar::IO;
     raw::Bool = false,
     strict::Bool = !raw,
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     raw && strict &&
         error("`raw=true` and `strict=true` options are incompatible")
@@ -22,7 +28,7 @@ function extract_tarball(
     predicate::Function,
     tarball::AbstractString,
     root::String;
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     open(tarball) do tar
         extract_tarball(predicate, tar, root, buf=buf)
@@ -33,7 +39,7 @@ function extract_tarball(
     predicate::Function,
     tar::IO,
     root::String;
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     links = Set{String}()
     while !eof(tar)
@@ -117,7 +123,7 @@ const IGNORED_EXTENDED_LOCAL_HEADERS = [
     "uname",
 ]
 
-function read_header(io::IO; buf::Vector{UInt8} = Vector{UInt8}(undef, 512))
+function read_header(io::IO; buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE))
     hdr = read_standard_header(io, buf=buf)
     hdr === nothing && return nothing
     size = path = link = nothing
@@ -167,7 +173,7 @@ using Base.Checked: mul_with_overflow, add_with_overflow
 function read_extended_metadata(
     io::IO,
     size::Integer;
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )
     n = readbytes!(io, buf, size)
     n < size && "premature end of tar file"
@@ -207,30 +213,26 @@ function read_extended_metadata(
     return metadata
 end
 
-function read_standard_header(io::IO; buf::Vector{UInt8} = Vector{UInt8}(undef, 512))
-    resize!(buf, 512)
-    read!(io, buf)
-    all(iszero, buf) && return nothing
-    n = length(buf)
-    n == 0 && error("premature end of tar file")
-    n < 512 && error("incomplete trailing block with length $n < 512")
-    @assert n == 512
-    name = read_header_str(buf, 0, 100)
-    mode = read_header_int(buf, 100, 8)
-    size = buf[124+1] & 0x80 == 0 ?
-        read_header_int(buf, 124, 12) :
-        read_header_bin(buf, 124, 12)
-    chksum = read_header_int(buf, 148, 8)
-    type = read_header_chr(buf, 156)
-    link = read_header_str(buf, 157, 100)
-    magic = read_header_str(buf, 257, 6)
-    version = read_header_str(buf, 263, 2)
-    prefix = read_header_str(buf, 345, 155)
+function read_standard_header(io::IO; buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE))
+    header_view = view(buf, 1:512)
+    view_read!(io, header_view)
+    all(iszero, header_view) && return nothing
+    name = read_header_str(header_view, 0, 100)
+    mode = read_header_int(header_view, 100, 8)
+    size = header_view[124+1] & 0x80 == 0 ?
+        read_header_int(header_view, 124, 12) :
+        read_header_bin(header_view, 124, 12)
+    chksum = read_header_int(header_view, 148, 8)
+    type = read_header_chr(header_view, 156)
+    link = read_header_str(header_view, 157, 100)
+    magic = read_header_str(header_view, 257, 6)
+    version = read_header_str(header_view, 263, 2)
+    prefix = read_header_str(header_view, 345, 155)
     # check various fields
-    buf[index_range(148, 8)] .= ' ' # fill checksum field with spaces
-    buf_sum = sum(buf)
+    header_view[index_range(148, 8)] .= ' ' # fill checksum field with spaces
+    buf_sum = sum(header_view)
     chksum == buf_sum ||
-        error("incorrect header checksum = $chksum; should be $buf_sum\n$(repr(String(buf)))")
+        error("incorrect header checksum = $chksum; should be $buf_sum\n$(repr(String(header_view)))")
     occursin(r"^ustar\s*$", magic) ||
         error("unknown magic string for tar file: $(repr(magic))")
     occursin(r"^0* *$", version) ||
@@ -239,15 +241,16 @@ function read_standard_header(io::IO; buf::Vector{UInt8} = Vector{UInt8}(undef,
     return Header(path, to_symbolic_type(type), mode, size, link)
 end
 
+round_up(size) = 512 * ((size + 511) ÷ 512)
 function skip_data(tar::IO, size::Integer)
-    skip(tar, 512 * ((size + 511) ÷ 512))
+    skip(tar, round_up(size))
 end
 
 index_range(offset::Int, length::Int) = offset .+ (1:length)
 
-read_header_chr(buf::Vector{UInt8}, offset::Int) = Char(buf[offset+1])
+read_header_chr(buf::AbstractVector{UInt8}, offset::Int) = Char(buf[offset+1])
 
-function read_header_str(buf::Vector{UInt8}, offset::Int, length::Int)
+function read_header_str(buf::AbstractVector{UInt8}, offset::Int, length::Int)
     r = index_range(offset, length)
     for i in r
         byte = buf[i]
@@ -256,7 +259,7 @@ function read_header_str(buf::Vector{UInt8}, offset::Int, length::Int)
     return String(buf[r])
 end
 
-function read_header_int(buf::Vector{UInt8}, offset::Int, length::Int)
+function read_header_int(buf::AbstractVector{UInt8}, offset::Int, length::Int)
     n = UInt64(0)
     for i in index_range(offset, length)
         byte = buf[i]
@@ -269,7 +272,7 @@ function read_header_int(buf::Vector{UInt8}, offset::Int, length::Int)
     return n
 end
 
-function read_header_bin(buf::Vector{UInt8}, offset::Int, length::Int)
+function read_header_bin(buf::AbstractVector{UInt8}, offset::Int, length::Int)
     n = UInt64(0)
     for i in index_range(offset, length)
         n <<= 8
@@ -282,16 +285,13 @@ function read_data(
     tar::IO,
     file::IO;
     size::Integer,
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )::Nothing
-    resize!(buf, 512)
     while size > 0
-        r = readbytes!(tar, buf)
+        r = readbytes!(tar, buf, size < sizeof(buf) ? round_up(size) : sizeof(buf))
         r < 512 && eof(io) && error("premature end of tar file")
-        size < 512 && resize!(buf, size)
-        size -= write(file, buf)
+        size -= write(file, view(buf, 1:min(r, size)))
     end
-    resize!(buf, 512)
     @assert size == 0
     return
 end
@@ -300,7 +300,7 @@ function read_data(
     tar::IO,
     file::String;
     size::Integer,
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )::Nothing
     open(file, write=true) do file′
         read_data(tar, file′, size=size, buf=buf)
@@ -310,7 +310,7 @@ end
 function read_data(
     tar::IO;
     size::Integer,
-    buf::Vector{UInt8} = Vector{UInt8}(undef, 512),
+    buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE),
 )::String
     io = IOBuffer(sizehint=size)
     read_data(tar, io, size=size, buf=buf)