-
Notifications
You must be signed in to change notification settings - Fork 32
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
zstd error
when trying to read a file produced by Polars
#173
Comments
In fact, here is the Polars code (mostly the same as above):
import polars as pl
df = pl.DataFrame({
'DateTime': [
'2018-02-01 00:00:00', '2018-02-02 00:00:00',
'2018-02-03 00:00:00', '2018-02-04 00:00:00',
'2018-02-05 00:00:00'
],
'String': ['a', 'b', 'c', 'd', 'e'],
'Float': [1.2, 3.4, 5.6, 7.8, 9.0],
}).select(
pl.col('DateTime').str.strptime(pl.Datetime, '%Y-%m-%d %H:%M:%S'),
pl.exclude('DateTime')
)
print(df)
for algo in 'lz4 uncompressed snappy gzip brotli zstd'.split():
df.write_parquet(f"test.{algo}.parquet", compression=algo)
    print(algo)
This uses all available compression algorithms (except lzo, which is omitted from the list). Julia code:
julia> read_parquet("./test.lz4.parquet") |> DataFrame
ERROR: Unknown compression codec for column chunk: 7
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] (::Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64})()
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:189
[3] (::Parquet.var"#40#42"{Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64}, Parquet.PageLRU, Tuple{Parquet.PAR2.ColumnChunk, Int64}})()
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:34
[4] lock(f::Parquet.var"#40#42"{Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64}, Parquet.PageLRU, Tuple{Parquet.PAR2.ColumnChunk, Int64}}, l::ReentrantLock)
@ Base ./lock.jl:229
[5] cacheget
@ ~/.julia/packages/Parquet/6tj1X/src/reader.jl:30 [inlined]
[6] iterate(ccp::Parquet.ColumnChunkPages, startpos::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:167
[7] iterate(ccpv::Parquet.ColumnChunkPageValues{Int64}, startpos::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:262
[8] iterate
@ ~/.julia/packages/Parquet/6tj1X/src/reader.jl:240 [inlined]
[9] setrow(cursor::Parquet.ColCursor{Int64}, row::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:114
[10] Parquet.ColCursor(par::Parquet.File, colname::Vector{String}; rows::UnitRange{Int64}, row::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:62
[11] ColCursor
@ ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:56 [inlined]
[12] #59
@ ./none:0 [inlined]
[13] iterate
@ ./generator.jl:47 [inlined]
[14] collect(itr::Base.Generator{Vector{Vector{String}}, Parquet.var"#59#61"{UnitRange{Int64}, Parquet.File}})
@ Base ./array.jl:782
[15] BatchedColumnsCursor(par::Parquet.File; rows::UnitRange{Int64}, batchsize::Int64, reusebuffer::Bool, use_threads::Bool)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:254
[16] cursor(table::Parquet.Table)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:130
[17] load(table::Parquet.Table)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:134
[18] getcolumn
@ ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:186 [inlined]
[19] getcolumn
@ ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:184 [inlined]
[20] fromcolumns(x::Parquet.Table, names::Vector{Symbol}; copycols::Bool)
@ DataFrames ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:36
[21] fromcolumns
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:36 [inlined]
[22] #fromcolumns#879
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:45 [inlined]
[23] DataFrame(x::Parquet.Table; copycols::Nothing)
@ DataFrames ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:59
[24] DataFrame
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:48 [inlined]
[25] |>(x::Parquet.Table, f::Type{DataFrame})
@ Base ./operators.jl:907
[26] top-level scope
@ REPL[2]:1
julia> read_parquet("./test.uncompressed.parquet") |> DataFrame
ERROR: BoundsError: attempt to access 42-element Vector{UInt8} at index [4538]
Stacktrace:
[1] getindex
@ ./essentials.jl:13 [inlined]
[2] _read_fixed
@ ~/.julia/packages/Parquet/6tj1X/src/codec.jl:105 [inlined]
[3] read_fixed
@ ~/.julia/packages/Parquet/6tj1X/src/codec.jl:97 [inlined]
[4] read_plain_values(inp::Parquet.InputState, out::Parquet.OutputState{Int64}, count::Int32)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/codec.jl:216
[5] iterate(ccpv::Parquet.ColumnChunkPageValues{Int64}, startpos::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:284
[6] iterate
@ ~/.julia/packages/Parquet/6tj1X/src/reader.jl:240 [inlined]
[7] setrow(cursor::Parquet.ColCursor{Int64}, row::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:114
[8] Parquet.ColCursor(par::Parquet.File, colname::Vector{String}; rows::UnitRange{Int64}, row::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:62
[9] ColCursor
@ ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:56 [inlined]
[10] #59
@ ./none:0 [inlined]
[11] iterate
@ ./generator.jl:47 [inlined]
[12] collect(itr::Base.Generator{Vector{Vector{String}}, Parquet.var"#59#61"{UnitRange{Int64}, Parquet.File}})
@ Base ./array.jl:782
[13] BatchedColumnsCursor(par::Parquet.File; rows::UnitRange{Int64}, batchsize::Int64, reusebuffer::Bool, use_threads::Bool)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:254
[14] cursor(table::Parquet.Table)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:130
[15] load(table::Parquet.Table)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:134
[16] getcolumn
@ ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:186 [inlined]
[17] getcolumn
@ ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:184 [inlined]
[18] fromcolumns(x::Parquet.Table, names::Vector{Symbol}; copycols::Bool)
@ DataFrames ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:36
[19] fromcolumns
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:36 [inlined]
[20] #fromcolumns#879
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:45 [inlined]
[21] DataFrame(x::Parquet.Table; copycols::Nothing)
@ DataFrames ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:59
[22] DataFrame
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:48 [inlined]
[23] |>(x::Parquet.Table, f::Type{DataFrame})
@ Base ./operators.jl:907
[24] top-level scope
@ REPL[3]:1
julia> read_parquet("./test.snappy.parquet") |> DataFrame
ERROR: BoundsError: attempt to access 42-element Vector{UInt8} at index [43]
Stacktrace:
[1] getindex
@ ./essentials.jl:13 [inlined]
[2] _read_fixed
@ ~/.julia/packages/Parquet/6tj1X/src/codec.jl:105 [inlined]
[3] read_rle_run(inp::Parquet.InputState, out::Parquet.OutputState{Int32}, count::Int64, bits::UInt8, byt::Int64, read_type::Type{UInt32})
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/codec.jl:321
[4] read_hybrid(inp::Parquet.InputState, out::Parquet.OutputState{Int32}, count::Int32, bits::UInt8, byt::Int64; read_len::Bool)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/codec.jl:311
[5] read_hybrid
@ ~/.julia/packages/Parquet/6tj1X/src/codec.jl:283 [inlined]
[6] read_levels(inp::Parquet.InputState, out::Parquet.OutputState{Int32}, max_val::Int64, enc::Int32, num_values::Int32)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:314
[7] read_levels_and_nmissing(inp::Parquet.InputState, defn_out::Parquet.OutputState{Int32}, repn_out::Parquet.OutputState{Int32}, defn_enc::Int32, repn_enc::Int32, max_defn::Int64, max_repn::Int64, num_values::Int32)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:333
[8] iterate(ccpv::Parquet.ColumnChunkPageValues{Int64}, startpos::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:275
[9] iterate
@ ~/.julia/packages/Parquet/6tj1X/src/reader.jl:240 [inlined]
[10] setrow(cursor::Parquet.ColCursor{Int64}, row::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:114
[11] Parquet.ColCursor(par::Parquet.File, colname::Vector{String}; rows::UnitRange{Int64}, row::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:62
[12] ColCursor
@ ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:56 [inlined]
[13] #59
@ ./none:0 [inlined]
[14] iterate
@ ./generator.jl:47 [inlined]
[15] collect(itr::Base.Generator{Vector{Vector{String}}, Parquet.var"#59#61"{UnitRange{Int64}, Parquet.File}})
@ Base ./array.jl:782
[16] BatchedColumnsCursor(par::Parquet.File; rows::UnitRange{Int64}, batchsize::Int64, reusebuffer::Bool, use_threads::Bool)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:254
[17] cursor(table::Parquet.Table)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:130
[18] load(table::Parquet.Table)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:134
[19] getcolumn
@ ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:186 [inlined]
[20] getcolumn
@ ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:184 [inlined]
[21] fromcolumns(x::Parquet.Table, names::Vector{Symbol}; copycols::Bool)
@ DataFrames ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:36
[22] fromcolumns
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:36 [inlined]
[23] #fromcolumns#879
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:45 [inlined]
[24] DataFrame(x::Parquet.Table; copycols::Nothing)
@ DataFrames ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:59
[25] DataFrame
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:48 [inlined]
[26] |>(x::Parquet.Table, f::Type{DataFrame})
@ Base ./operators.jl:907
[27] top-level scope
@ REPL[4]:1
julia> read_parquet("./test.gzip.parquet") |> DataFrame
ERROR: zlib error: incorrect header check (code: -3)
Stacktrace:
[1] changemode!(stream::TranscodingStreams.TranscodingStream{CodecZlib.GzipDecompressor, IOBuffer}, newmode::Symbol)
@ TranscodingStreams ~/.julia/packages/TranscodingStreams/5yQuA/src/stream.jl:742
[2] callprocess(stream::TranscodingStreams.TranscodingStream{CodecZlib.GzipDecompressor, IOBuffer}, inbuf::TranscodingStreams.Buffer, outbuf::TranscodingStreams.Buffer)
@ TranscodingStreams ~/.julia/packages/TranscodingStreams/5yQuA/src/stream.jl:668
[3] fillbuffer(stream::TranscodingStreams.TranscodingStream{CodecZlib.GzipDecompressor, IOBuffer}; eager::Bool)
@ TranscodingStreams ~/.julia/packages/TranscodingStreams/5yQuA/src/stream.jl:596
[4] fillbuffer
@ ~/.julia/packages/TranscodingStreams/5yQuA/src/stream.jl:582 [inlined]
[5] eof
@ ~/.julia/packages/TranscodingStreams/5yQuA/src/stream.jl:201 [inlined]
[6] readbytes!(stream::TranscodingStreams.TranscodingStream{CodecZlib.GzipDecompressor, IOBuffer}, b::Vector{UInt8}, nb::Int64)
@ TranscodingStreams ~/.julia/packages/TranscodingStreams/5yQuA/src/stream.jl:387
[7] readbytes!
@ ~/.julia/packages/TranscodingStreams/5yQuA/src/stream.jl:384 [inlined]
[8] (::Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64})()
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:185
[9] (::Parquet.var"#40#42"{Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64}, Parquet.PageLRU, Tuple{Parquet.PAR2.ColumnChunk, Int64}})()
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:34
[10] lock(f::Parquet.var"#40#42"{Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64}, Parquet.PageLRU, Tuple{Parquet.PAR2.ColumnChunk, Int64}}, l::ReentrantLock)
@ Base ./lock.jl:229
[11] cacheget
@ ~/.julia/packages/Parquet/6tj1X/src/reader.jl:30 [inlined]
[12] iterate(ccp::Parquet.ColumnChunkPages, startpos::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:167
[13] iterate(ccpv::Parquet.ColumnChunkPageValues{Int64}, startpos::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:262
[14] iterate
@ ~/.julia/packages/Parquet/6tj1X/src/reader.jl:240 [inlined]
[15] setrow(cursor::Parquet.ColCursor{Int64}, row::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:114
[16] Parquet.ColCursor(par::Parquet.File, colname::Vector{String}; rows::UnitRange{Int64}, row::Int64)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:62
[17] ColCursor
@ ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:56 [inlined]
[18] #59
@ ./none:0 [inlined]
[19] iterate
@ ./generator.jl:47 [inlined]
[20] collect(itr::Base.Generator{Vector{Vector{String}}, Parquet.var"#59#61"{UnitRange{Int64}, Parquet.File}})
@ Base ./array.jl:782
[21] BatchedColumnsCursor(par::Parquet.File; rows::UnitRange{Int64}, batchsize::Int64, reusebuffer::Bool, use_threads::Bool)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/cursor.jl:254
[22] cursor(table::Parquet.Table)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:130
[23] load(table::Parquet.Table)
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:134
[24] getcolumn
@ ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:186 [inlined]
[25] getcolumn
@ ~/.julia/packages/Parquet/6tj1X/src/simple_reader.jl:184 [inlined]
[26] fromcolumns(x::Parquet.Table, names::Vector{Symbol}; copycols::Bool)
@ DataFrames ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:36
[27] fromcolumns
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:36 [inlined]
[28] #fromcolumns#879
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:45 [inlined]
[29] DataFrame(x::Parquet.Table; copycols::Nothing)
@ DataFrames ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:59
[30] DataFrame
@ ~/.julia/packages/DataFrames/LteEl/src/other/tables.jl:48 [inlined]
[31] |>(x::Parquet.Table, f::Type{DataFrame})
@ Base ./operators.jl:907
[32] top-level scope
@ REPL[5]:1
julia> read_parquet("./test.brotli.parquet") |> DataFrame
ERROR: Unknown compression codec for column chunk: 4
Stacktrace:
[1] error(s::String)
@ Base ./error.jl:35
[2] (::Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64})()
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:189
[3] (::Parquet.var"#40#42"{Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64}, Parquet.PageLRU, Tuple{Parquet.PAR2.ColumnChunk, Int64}})()
@ Parquet ~/.julia/packages/Parquet/6tj1X/src/reader.jl:34
[4] lock(f::Parquet.var"#40#42"{Parquet.var"#46#47"{Parquet.ColumnChunkPages, Int64}, Parquet.PageLRU, Tuple{Parquet.PAR2.ColumnChunk, Int64}}, l::ReentrantLock)
@ Base ./lock.jl:229
...
|
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Polars Python code
Note that
compression='zstd'
is the default. Generated Parquet file: test.parquet.zip.
Parquet.jl code
Parquet.jl error
Versions
The text was updated successfully, but these errors were encountered: