From ba5105002f05fa288123af640a8f8d1729341064 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 13:54:02 +1000 Subject: [PATCH 01/52] before merge with remote master --- Project.toml | 14 +++++++++++++- src/Parquet.jl | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 8469635..2630be1 100644 --- a/Project.toml +++ b/Project.toml @@ -2,21 +2,33 @@ name = "Parquet" uuid = "626c502c-15b0-58ad-a749-f091afb673ae" keywords = ["parquet", "julia", "columnar-storage"] license = "MIT" -desc = "Julia implementation of parquet columnar file format reader" +desc = "Julia implementation of parquet columnar file format reader and writer" version = "0.3.2" [deps] +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" +CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22" [compat] +CategoricalArrays = "0.6,0.7,0.8" CodecZlib = "0.5,0.6,0.7" +CodecZstd = "0.7" +DataAPI = "1" +LittleEndianBase128 = "0.3" MemPool = "0.2" +ProgressMeter = "1" ProtoBuf = "0.7,0.8" Snappy = "0.3" +Tables = "1" Thrift = "0.6" julia = "1" diff --git a/src/Parquet.jl b/src/Parquet.jl index 160faf6..f86b458 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -13,6 +13,7 @@ export is_par_file, ParFile, show, nrows, ncols, rowgroups, columns, pages, byte export SchemaConverter, schema, JuliaConverter, ThriftConverter, ProtoConverter export RowCursor, ColCursor, RecCursor export AbstractBuilder, JuliaBuilder +export write_parquet # package code goes here include("PAR2/PAR2.jl") @@ -22,5 +23,6 @@ include("schema.jl") include("reader.jl") include("cursor.jl") include("show.jl") +include("writer.jl") end # module From 39323df31e4abcc713a6b150733b9a842bfaf30f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:09:37 +1000 Subject: [PATCH 02/52] adding tests --- Project.toml | 2 +- test/test_writer.jl | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 1436e3b..9564631 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" @@ -17,7 +18,6 @@ ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22" -Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" [compat] CategoricalArrays = "0.6,0.7,0.8" diff --git a/test/test_writer.jl b/test/test_writer.jl index 8e55412..23805e1 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -4,7 +4,7 @@ using Random:randstring tbl = ( int32 = Int32.(1:1000), - int64 = Int32.(1:1000), + int64 = Int64.(1:1000), float32 = Float32.(1:1000), float64 = 
Float64.(1:1000), bool = rand(Bool, 1000), @@ -14,10 +14,17 @@ tbl = ( float32m = rand([missing, Float32.(1:100)...], 1000), float64m = rand([missing, Float64.(1:100)...], 1000), boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) ) write_parquet("tmp.parquet", tbl) -ParFile("tmp.parquet") +pf = ParFile("tmp.parquet") +col_chunks = columns(pf, 1) +vals = values.(Ref(pf), Ref(col_chunks), 1:length(col_chunks)) + +vals = values(pf, col_chunks, 5) +vals = values(pf, col_chunks, 6) +vals = values(pf, col_chunks, 7) rm("tmp.parquet") From e41a113d150145f89bbfcdced4917d047f4715f2 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:36:46 +1000 Subject: [PATCH 03/52] added tests for wrtier --- src/reader.jl | 7 +++-- test/test_writer.jl | 77 ++++++++++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/src/reader.jl b/src/reader.jl index 518d054..f86f44b 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -1,4 +1,3 @@ - const PAR_MAGIC = "PAR1" const SZ_PAR_MAGIC = length(PAR_MAGIC) const SZ_FOOTER = 4 @@ -58,6 +57,10 @@ function ParFile(path::AbstractString, handle::IOStream; maxcache::Integer=10) ParFile(path, handle, meta, Schema(meta.schema), PageLRU()) end +function Base.close(par::ParFile) + close(par.handle) +end + ## # layer 1 access # can access raw (uncompressed) bytes from pages @@ -371,7 +374,7 @@ function is_par_file(io) magic = Array{UInt8}(undef, 4) read!(io, magic) (String(magic) == PAR_MAGIC) || return false - + seek(io, sz - SZ_PAR_MAGIC) magic = Array{UInt8}(undef, 4) read!(io, magic) diff --git a/test/test_writer.jl b/test/test_writer.jl index 23805e1..ffd9c95 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -1,30 +1,51 @@ using Parquet using Test -using Random:randstring - -tbl = ( - int32 = Int32.(1:1000), - int64 = Int64.(1:1000), - float32 = Float32.(1:1000), - float64 = Float64.(1:1000), - bool = rand(Bool, 1000), - string = [randstring(8) for i in 1:1000], - int32m = rand([missing, 1:100...], 1000), - int64m = rand([missing, 1:100...], 1000), - float32m = rand([missing, Float32.(1:100)...], 1000), - float64m = rand([missing, Float64.(1:100)...], 1000), - boolm = rand([missing, true, false], 1000), - stringm = rand([missing, "abc", "def", "ghi"], 1000) -) - -write_parquet("tmp.parquet", tbl) - -pf = ParFile("tmp.parquet") -col_chunks = columns(pf, 1) -vals = values.(Ref(pf), Ref(col_chunks), 1:length(col_chunks)) - -vals = values(pf, col_chunks, 5) -vals = values(pf, col_chunks, 6) -vals = values(pf, col_chunks, 7) - -rm("tmp.parquet") +using Random + +Random.seed!(1234567) + +function test_write() + tbl = ( + int32 = Int32.(1:1000), + int64 = Int64.(1:1000), + float32 = Float32.(1:1000), + float64 = Float64.(1:1000), + bool = rand(Bool, 1000), + string = [randstring(8) for i in 1:1000], + int32m = rand([missing, 1:100...], 1000), + int64m = rand([missing, 1:100...], 1000), + float32m = rand([missing, Float32.(1:100)...], 1000), + float64m = rand([missing, Float64.(1:100)...], 1000), + boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) + ) + + write_parquet("tmp_plsdel.parquet", tbl) + + pf = ParFile("tmp_plsdel.parquet") + + # the file is very smalll so only one rowgroup + col_chunks = columns(pf, 1) + + for colnum in 1:length(col_chunks) + correct_vals = tbl[colnum] + coltype = eltype(correct_vals) + vals_from_file = values(pf, col_chunks, colnum) + if Missing <: coltype + @test ismissing.(correct_vals) 
== (vals_from_file[2] .== 0) + end + + if nonmissingtype(coltype) == String + @test all(skipmissing(correct_vals) .== String.(vals_from_file[1])) + else + @test all(skipmissing(correct_vals) .== vals_from_file[1]) + end + end + + # clean up + close(pf) + + #rm("tmp_plsdel.parquet") +end + +test_write() From be7fb944655f2d433e3b2b09a3331fb05cd9f164 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:43:01 +1000 Subject: [PATCH 04/52] added readme for test write and used tempname() --- README.md | 29 ++++++++++++++++++++++++++++- test/test_writer.jl | 7 ++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7b0ecf4..90af020 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ [![Build Status](https://travis-ci.org/JuliaIO/Parquet.jl.svg?branch=master)](https://travis-ci.org/JuliaIO/Parquet.jl) [![Build status](https://ci.appveyor.com/api/projects/status/vrqg01w2sj3mfk3d/branch/master?svg=true)](https://ci.appveyor.com/project/tanmaykm/parquet-jl/branch/master) +## Reader + Load a [parquet file](https://en.wikipedia.org/wiki/Apache_Parquet). Only metadata is read initially, data is loaded in chunks on demand. (Note: [ParquetFiles.jl](https://github.com/queryverse/ParquetFiles.jl) also provides load support for Parquet files under the FileIO.jl package.) ```julia @@ -31,7 +33,7 @@ julia> colnames(p) 8-element Array{AbstractString,1}: "c_acctbal" "c_mktsegment" - "c_nationkey" + "c_nationkey" "c_name" "c_address" "c_custkey" @@ -140,3 +142,28 @@ julia> for v in values 04/01/09, 2009-04-01T12:01:00 ``` +## Writer + +You can write any Tables.jl column accessible tables that contains columns of these types and their union with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64` + +### Writer Example + +```julia +tbl = ( + int32 = Int32.(1:1000), + int64 = Int64.(1:1000), + float32 = Float32.(1:1000), + float64 = Float64.(1:1000), + bool = rand(Bool, 1000), + string = [randstring(8) for i in 1:1000], + int32m = rand([missing, 1:100...], 1000), + int64m = rand([missing, 1:100...], 1000), + float32m = rand([missing, Float32.(1:100)...], 1000), + float64m = rand([missing, Float64.(1:100)...], 1000), + boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) +) + +file = tempname()*".parquet" +write_parquet(file, tbl) +``` diff --git a/test/test_writer.jl b/test/test_writer.jl index ffd9c95..61592af 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -20,9 +20,10 @@ function test_write() stringm = rand([missing, "abc", "def", "ghi"], 1000) ) - write_parquet("tmp_plsdel.parquet", tbl) + tmpfile = tempname()*".parquet" + write_parquet(tmpfile, tbl) - pf = ParFile("tmp_plsdel.parquet") + pf = ParFile(tmpfile) # the file is very smalll so only one rowgroup col_chunks = columns(pf, 1) @@ -45,7 +46,7 @@ function test_write() # clean up close(pf) - #rm("tmp_plsdel.parquet") + #rm(tmpfile) end test_write() From 866323edb953a5e7c561f9ef176f57ddaf860dba Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:44:35 +1000 Subject: [PATCH 05/52] fixed project.toml adding random --- Project.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 9564631..3701976 100644 --- a/Project.toml +++ b/Project.toml @@ -34,7 +34,8 @@ Thrift = "0.6" julia = "1" [extras] +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["Test", "Random"] From 
f1e70c8415f265b53f717d0709860a8f0429eb5e Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:56:02 +1000 Subject: [PATCH 06/52] added version to writer --- Project.toml | 1 + src/Parquet.jl | 3 +++ src/writer.jl | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 3701976..54f72cf 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" diff --git a/src/Parquet.jl b/src/Parquet.jl index 149da23..6a5ce50 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -7,6 +7,9 @@ using CodecZlib using MemPool using Dates +using Pkg +const PARQUET_JL_VERSION = VersionNumber(Pkg.TOML.parsefile(joinpath(@__DIR__, "..", "Project.toml"))["version"]) + import Base: show, open, close, values import Thrift: isfilled diff --git a/src/writer.jl b/src/writer.jl index 9d02561..2b1ff64 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -499,7 +499,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") Thrift.set_field!(filemetadata, :version, 1) Thrift.set_field!(filemetadata, :schema, schemas) Thrift.set_field!(filemetadata, :num_rows, nrows) - Thrift.set_field!(filemetadata, :created_by, "Parquet.jl") + Thrift.set_field!(filemetadata, :created_by, "Parquet.jl $(Parquet.PARQUET_JL_VERSION)") # create row_groups # TODO do multiple row_groups From 40cbfefb9e865bdd8620c4021b626bdde4eecd00 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 21:19:47 +1000 Subject: [PATCH 07/52] added missing for Julia 1.0.5 --- Project.toml | 2 ++ src/Parquet.jl | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/Project.toml b/Project.toml index 39a804b..b2a04de 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" +Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" @@ -27,6 +28,7 @@ CodecZstd = "0.6,0.7" DataAPI = "1" LittleEndianBase128 = "0.3" MemPool = "0.2" +Missings = "0.3,0.4" ProgressMeter = "1" ProtoBuf = "0.7,0.8" Snappy = "0.3" diff --git a/src/Parquet.jl b/src/Parquet.jl index ca2054a..748cc67 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -8,6 +8,10 @@ using CodecZstd using MemPool using Dates +if VERSION < v"1.3" + using Missings: nonmissingtype +end + using Pkg const PARQUET_JL_VERSION = VersionNumber(Pkg.TOML.parsefile(joinpath(@__DIR__, "..", "Project.toml"))["version"]) From be5e64cb2341aca9bd4da65ec80c11612e76c201 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Thu, 14 May 2020 23:46:35 +1000 Subject: [PATCH 08/52] removed progress meter --- Project.toml | 2 -- README.md | 4 +++- src/writer.jl | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index b2a04de..be869a4 100644 --- a/Project.toml +++ b/Project.toml @@ -15,7 +15,6 @@ LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = 
"f9f48841-c794-520a-933b-121f7ba6ed94" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" @@ -29,7 +28,6 @@ DataAPI = "1" LittleEndianBase128 = "0.3" MemPool = "0.2" Missings = "0.3,0.4" -ProgressMeter = "1" ProtoBuf = "0.7,0.8" Snappy = "0.3" Tables = "1" diff --git a/README.md b/README.md index 90af020..6ac9e87 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,9 @@ julia> for v in values ## Writer -You can write any Tables.jl column accessible tables that contains columns of these types and their union with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64` +You can write any Tables.jl column-accessible table that contains columns of these types and their union with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64`. + +However, `CategoricalArray`s are not yet supported. Furthermore, these types are not yet supported: `Int96`, `Int128`, `Date`, and `DateTime`. ### Writer Example diff --git a/src/writer.jl b/src/writer.jl index 2b1ff64..4ad0464 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -8,7 +8,6 @@ using CodecZlib: GzipCompressor using LittleEndianBase128 using Base.Iterators: partition using CategoricalArrays: CategoricalArray, CategoricalValue -using ProgressMeter # a mapping of Julia types to _Type codes in Parquet format const COL_TYPE_CODE = Dict{DataType, Int32}( @@ -429,7 +428,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") recommended_chunks = 1 end - @showprogress for (coli, colname_sym) in enumerate(colnames) + for (coli, colname_sym) in enumerate(colnames) colvals = Tables.getcolumn(tbl, colname_sym) colname = String(colname_sym) From c97a06744157bbbb3dce86ae4757618fff96c174 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Thu, 14 May 2020 23:52:43 +1000 Subject: [PATCH 09/52] typo --- src/writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/writer.jl b/src/writer.jl index 4ad0464..b7e6b70 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -22,7 +22,7 @@ const COL_TYPE_CODE = Dict{DataType, Int32}( ) function write_thrift(fileio, thrift_obj) - """write thrift defiition to file""" + """write thrift definition to file""" p = TCompactProtocol(TFileTransport(fileio)) Thrift.write(p, thrift_obj) end From c97b56b8482c08c8011f9628b2eb787f87395fab Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Fri, 15 May 2020 00:35:46 +1000 Subject: [PATCH 10/52] fixed julia fail bug --- test/test_writer.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_writer.jl b/test/test_writer.jl index 61592af..dbee089 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -2,6 +2,10 @@ using Parquet using Test using Random +if VERSION < v"1.3" + using Missings: nonmissingtype +end + Random.seed!(1234567) function test_write() From 34ed20a3f457a07ae28bf81655a2a9a5d3534fd2 Mon Sep 17 00:00:00 2001 From: evalparse Date: Sat, 16 May 2020 12:09:18 +1000 Subject: [PATCH 11/52] Update src/writer.jl --- src/writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/writer.jl b/src/writer.jl index b7e6b70..8567e15 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -75,7 +75,7 @@ function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T # do not support dictionary with more than 127 levels # TODO relax this 127 restriction if 
length(uvals) > 127 - @warn "More than 127 levels in dictionary. This is not supported at this stage." + @warn "More than 127 levels in dictionary. Parquet.jl does not support this at this stage." return (offset = missing, uncompressed_size = 0, compressed_size = 0) end From 781ff7d1ae665ff8cedffd6c5f19a69929fbbc37 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 12:28:58 +1000 Subject: [PATCH 12/52] minor refactor --- src/writer.jl | 275 +++++++++++++++++++++++++------------------------- 1 file changed, 139 insertions(+), 136 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index b7e6b70..dfd2cb2 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -14,11 +14,11 @@ const COL_TYPE_CODE = Dict{DataType, Int32}( Bool => PAR2._Type.BOOLEAN, Int32 => PAR2._Type.INT32, Int64 => PAR2._Type.INT64, - #INT96 => 3, // deprecated, only used by legacy implementations. # not supported + #INT96 => 3, // deprecated, only used by legacy implementations. # not supported by Parquet.jl Float32 => PAR2._Type.FLOAT, Float64 => PAR2._Type.DOUBLE, String => PAR2._Type.BYTE_ARRAY, # BYTE_ARRAY - # FIXED_LEN_BYTE_ARRAY => 7, + # FIXED_LEN_BYTE_ARRAY => 7, # current there is no Julia type that we support that maps to this type ) function write_thrift(fileio, thrift_obj) @@ -120,167 +120,170 @@ function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T end # TODO set the encoding code into a dictionary -function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, encoding) where T - """write a chunk of data into a data page""" - if encoding == PAR2.Encoding.PLAIN - # generate the data page header - data_page_header = PAR2.PageHeader() - - # write repetition level data - # do nothing - # this seems to be related to nested columns - # and hence is not needed here - - # set up a buffer to write to - data_to_compress_io = IOBuffer() - - if Missing <: T - # if there is missing - # use the bit packing algorithm to write the - # definition_levels - - bytes_needed = ceil(Int, length(colvals) / 8sizeof(UInt8)) - tmp = UInt32((UInt32(bytes_needed) << 1) | 1) - bitpacking_header = LittleEndianBase128.encode(tmp) - - tmpio = IOBuffer() - not_missing_bits::BitArray = .!ismissing.(colvals) - write(tmpio, not_missing_bits) - seek(tmpio, 0) - - encoded_defn_data = read(tmpio, bytes_needed) - - encoded_defn_data_length = length(bitpacking_header) + bytes_needed - # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) - write(data_to_compress_io, bitpacking_header) - write(data_to_compress_io, encoded_defn_data) - else - # if there is no missing can just use RLE of one - # using rle - rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) - repeated_value = UInt8(1) - encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) - - # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) - write(data_to_compress_io, rle_header) - write(data_to_compress_io, repeated_value) - end +function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val{PAR2.Encoding.PLAIN}) where T + """write a chunk of data into a data page using PLAIN encoding""" - if nonmissingtype(T) == String - # write the values - for val in skipmissing(colvals) - # for string it needs to be stored as BYTE_ARRAY which needs the length - # to be the first 4 bytes UInt32 - write(data_to_compress_io, val |> sizeof |> UInt32) - # write each of the strings one after another - write(data_to_compress_io, val) - end - elseif 
nonmissingtype(T) == Bool - # write the bitacpked bits - # write a bitarray seems to write 8 bytes at a time - # so write to a tmpio first - no_missing_bit_vec = BitArray(skipmissing(colvals)) - bytes_needed = ceil(Int, length(no_missing_bit_vec) / 8sizeof(UInt8)) - tmpio = IOBuffer() - write(tmpio, no_missing_bit_vec) - seek(tmpio, 0) - packed_bits = read(tmpio, bytes_needed) - write(data_to_compress_io, packed_bits) - else - for val in skipmissing(colvals) - write(data_to_compress_io, val) - end + # generate the data page header + data_page_header = PAR2.PageHeader() + + # write repetition level data + # do nothing + # this seems to be related to nested columns + # and hence is not needed here + + # set up a buffer to write to + data_to_compress_io = IOBuffer() + + if Missing <: T + # if there is missing + # use the bit packing algorithm to write the + # definition_levels + + bytes_needed = ceil(Int, length(colvals) / 8sizeof(UInt8)) + tmp = UInt32((UInt32(bytes_needed) << 1) | 1) + bitpacking_header = LittleEndianBase128.encode(tmp) + + tmpio = IOBuffer() + not_missing_bits::BitArray = .!ismissing.(colvals) + write(tmpio, not_missing_bits) + seek(tmpio, 0) + + encoded_defn_data = read(tmpio, bytes_needed) + + encoded_defn_data_length = length(bitpacking_header) + bytes_needed + # write the definition data + write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, bitpacking_header) + write(data_to_compress_io, encoded_defn_data) + else + # if there is no missing can just use RLE of one + # using rle + rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) + repeated_value = UInt8(1) + encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) + + # write the definition data + write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, rle_header) + write(data_to_compress_io, repeated_value) + end + + if nonmissingtype(T) == String + # write the values + for val in skipmissing(colvals) + # for string it needs to be stored as BYTE_ARRAY which needs the length + # to be the first 4 bytes UInt32 + write(data_to_compress_io, val |> sizeof |> UInt32) + # write each of the strings one after another + write(data_to_compress_io, val) end + elseif nonmissingtype(T) == Bool + # write the bitacpked bits + # write a bitarray seems to write 8 bytes at a time + # so write to a tmpio first + no_missing_bit_vec = BitArray(skipmissing(colvals)) + bytes_needed = ceil(Int, length(no_missing_bit_vec) / 8sizeof(UInt8)) + tmpio = IOBuffer() + write(tmpio, no_missing_bit_vec) + seek(tmpio, 0) + packed_bits = read(tmpio, bytes_needed) + write(data_to_compress_io, packed_bits) + else + for val in skipmissing(colvals) + write(data_to_compress_io, val) + end + end - data_to_compress::Vector{UInt8} = take!(data_to_compress_io) + data_to_compress::Vector{UInt8} = take!(data_to_compress_io) - compressed_data::Vector{UInt8} = compress_using_codec(data_to_compress, codec) + compressed_data::Vector{UInt8} = compress_using_codec(data_to_compress, codec) - uncompressed_page_size = length(data_to_compress) - compressed_page_size = length(compressed_data) + uncompressed_page_size = length(data_to_compress) + compressed_page_size = length(compressed_data) - Thrift.set_field!(data_page_header, :_type, PAR2.PageType.DATA_PAGE) - Thrift.set_field!(data_page_header, :uncompressed_page_size, uncompressed_page_size) - Thrift.set_field!(data_page_header, :compressed_page_size, compressed_page_size) + Thrift.set_field!(data_page_header, 
:_type, PAR2.PageType.DATA_PAGE) + Thrift.set_field!(data_page_header, :uncompressed_page_size, uncompressed_page_size) + Thrift.set_field!(data_page_header, :compressed_page_size, compressed_page_size) - # TODO proper CRC - Thrift.set_field!(data_page_header, :crc , 0) + # TODO proper CRC + Thrift.set_field!(data_page_header, :crc , 0) - Thrift.set_field!(data_page_header, :data_page_header, PAR2.DataPageHeader()) - Thrift.set_field!(data_page_header.data_page_header, :num_values , Int32(length(colvals))) - Thrift.set_field!(data_page_header.data_page_header, :encoding , encoding) # encoding 0 is plain encoding - Thrift.set_field!(data_page_header.data_page_header, :definition_level_encoding, PAR2.Encoding.RLE) - Thrift.set_field!(data_page_header.data_page_header, :repetition_level_encoding, PAR2.Encoding.RLE) + Thrift.set_field!(data_page_header, :data_page_header, PAR2.DataPageHeader()) + Thrift.set_field!(data_page_header.data_page_header, :num_values , Int32(length(colvals))) + Thrift.set_field!(data_page_header.data_page_header, :encoding , encoding) # encoding 0 is plain encoding + Thrift.set_field!(data_page_header.data_page_header, :definition_level_encoding, PAR2.Encoding.RLE) + Thrift.set_field!(data_page_header.data_page_header, :repetition_level_encoding, PAR2.Encoding.RLE) - position_before_page_header_write = position(fileio) - write_thrift(fileio, data_page_header) - size_of_page_header_defn_repn = position(fileio) - position_before_page_header_write + position_before_page_header_write = position(fileio) + write_thrift(fileio, data_page_header) + size_of_page_header_defn_repn = position(fileio) - position_before_page_header_write - # write data - write(fileio, compressed_data) + # write data + write(fileio, compressed_data) - return ( - offset = position_before_page_header_write, - uncompressed_size = uncompressed_page_size + size_of_page_header_defn_repn, - compressed_size = compressed_page_size + size_of_page_header_defn_repn, - ) - elseif encoding == PAR2.Encoding.PLAIN_DICTIONARY - error("not implemented yet") - """Dictionary encoding""" - rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) - repeated_value = UInt8(1) + return ( + offset = position_before_page_header_write, + uncompressed_size = uncompressed_page_size + size_of_page_header_defn_repn, + compressed_size = compressed_page_size + size_of_page_header_defn_repn, + ) +end - encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) +function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val(PAR2.Encoding.PLAIN_DICTIONARY)) where T + error("PLAIN_DICTIONARY encoding not implemented yet") + """Dictionary encoding""" + rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) + repeated_value = UInt8(1) - ## write the encoded data length - write(fileio, encoded_defn_data_length) + encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) - write(fileio, rle_header) - write(fileio, repeated_value) + ## write the encoded data length + write(fileio, encoded_defn_data_length) - position(fileio) + write(fileio, rle_header) + write(fileio, repeated_value) - # write the data + position(fileio) - ## firstly, bit pack it - colvals + # write the data - # the bitwidth to use - bitwidth = ceil(UInt8, log(2, length(uvals))) - # the max bitwidth is 32 according to documentation - @assert bitwidth <= 32 - # to do that I have to figure out the Dictionary index of it - # build a JuliaDict - val_index_dict = Dict(zip(uvals, 1:length(uvals))) + ## 
firstly, bit pack it + colvals - bitwidth_mask = UInt32(2^bitwidth-1) + # the bitwidth to use + bitwidth = ceil(UInt8, log(2, length(uvals))) + # the max bitwidth is 32 according to documentation + @assert bitwidth <= 32 + # to do that I have to figure out the Dictionary index of it + # build a JuliaDict + val_index_dict = Dict(zip(uvals, 1:length(uvals))) - bytes_needed = ceil(Int, bitwidth*length(colvals) / 8) + bitwidth_mask = UInt32(2^bitwidth-1) - bit_packed_encoded_data = zeros(UInt8, bytes_needed) - upto_byte = 1 + bytes_needed = ceil(Int, bitwidth*length(colvals) / 8) - bits_written = 0 - bitsz = 8sizeof(UInt8) + bit_packed_encoded_data = zeros(UInt8, bytes_needed) + upto_byte = 1 - for val in colvals - bit_packed_val = UInt32(val_index_dict[val]) & bitwidth_mask - if bitwidth_mask <= bitsz - bits_written - bit_packed_encoded_data[upto_byte] = (bit_packed_encoded_data[upto_byte] << bitwidth_mask) | bit_packed_val - else - # this must mean - # bitwidth_mask > bitsz - bits_written - # if the remaining bits is not enough to write a packed number - 42 - end + bits_written = 0 + bitsz = 8sizeof(UInt8) + + for val in colvals + bit_packed_val = UInt32(val_index_dict[val]) & bitwidth_mask + if bitwidth_mask <= bitsz - bits_written + bit_packed_encoded_data[upto_byte] = (bit_packed_encoded_data[upto_byte] << bitwidth_mask) | bit_packed_val + else + # this must mean + # bitwidth_mask > bitsz - bits_written + # if the remaining bits is not enough to write a packed number + 42 end - else - error("Page encoding $encoding is yet not implemented.") end end +function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, encoding) where T + error("Page encoding $encoding is yet not implemented.") +end + write_col(fileio, colvals::CategoricalArray, args...; kwars...) 
= begin throw("Currently CategoricalArrays are not supported.") end From 620b0f907bcf8ae06bebe4ba8030fc179409b245 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 15:23:54 +1000 Subject: [PATCH 13/52] created a write encoded data and write definition functions --- src/writer.jl | 191 +++++++++++++++++++++++++------------------- test/test_writer.jl | 21 +++-- 2 files changed, 120 insertions(+), 92 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 730f890..39a78b0 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -9,6 +9,12 @@ using LittleEndianBase128 using Base.Iterators: partition using CategoricalArrays: CategoricalArray, CategoricalValue +if VERSION < v"1.3" + using Missings: SkipMissing +else + using Base: SkipMissing +end + # a mapping of Julia types to _Type codes in Parquet format const COL_TYPE_CODE = Dict{DataType, Int32}( Bool => PAR2._Type.BOOLEAN, @@ -67,6 +73,44 @@ function compress_using_codec(colvals::AbstractVector{String}, codec::Int)::Vect return compress_using_codec(uncompressed_bytes, codec) end +function write_defn_levels(data_to_compress_io, colvals::AbstractVector{Union{Missing, T}}) where T + """ A function to write definition levels for `Union{Missing, T}`""" + # if there is missing + # use the bit packing algorithm to write the + # definition_levels + bytes_needed = ceil(Int, length(colvals) / 8sizeof(UInt8)) + tmp = UInt32((UInt32(bytes_needed) << 1) | 1) + bitpacking_header = LittleEndianBase128.encode(tmp) + + tmpio = IOBuffer() + not_missing_bits::BitArray = .!ismissing.(colvals) + write(tmpio, not_missing_bits) + seek(tmpio, 0) + + encoded_defn_data = read(tmpio, bytes_needed) + + encoded_defn_data_length = length(bitpacking_header) + bytes_needed + # write the definition data + write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, bitpacking_header) + write(data_to_compress_io, encoded_defn_data) +end + +function write_defn_levels(data_to_compress_io, colvals::AbstractVector) + """ A function to write definition levels for NON-missing data + """ + # if there is no missing can just use RLE of one + # using rle + rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) + repeated_value = UInt8(1) + encoded_defn_data_length = sizeof(rle_header) + sizeof(repeated_value) + + # write the definition data + write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, rle_header) + write(data_to_compress_io, repeated_value) +end + function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T """ write the column dictionary page """ # note: `level`s does not return `missing` as a level @@ -119,80 +163,69 @@ function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T return (offset = before_write_page_header_pos, uncompressed_size = uncompressed_dict_size + dict_page_header_size, compressed_size = compressed_dict_size + dict_page_header_size) end -# TODO set the encoding code into a dictionary -function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val{PAR2.Encoding.PLAIN}) where T - """write a chunk of data into a data page using PLAIN encoding""" - # generate the data page header - data_page_header = PAR2.PageHeader() +write_encoded_data(data_to_compress_io, colvals::AbstractVector{Union{Missing, T}}) where T = + write_encoded_data(data_to_compress_io, skipmissing(colvals)) - # write repetition level data - # do nothing - # this seems to be related to nested columns - # and hence is not needed here +function 
write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{String}, SkipMissing{S}}) where S <: AbstractVector{Union{Missing, String}} + # write the values + for val in colvals + # for string it needs to be stored as BYTE_ARRAY which needs the length + # to be the first 4 bytes UInt32 + write(data_to_compress_io, val |> sizeof |> UInt32) + # write each of the strings one after another + write(data_to_compress_io, val) + end +end - # set up a buffer to write to - data_to_compress_io = IOBuffer() +function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{Bool}, SkipMissing{S}}) where S <: AbstractVector{Union{Missing, Bool}} + # write the bitacpked bits + # write a bitarray seems to write 8 bytes at a time + # so write to a tmpio first + no_missing_bit_vec = BitArray(colvals) + bytes_needed = ceil(Int, length(no_missing_bit_vec) / 8sizeof(UInt8)) + tmpio = IOBuffer() + write(tmpio, no_missing_bit_vec) + seek(tmpio, 0) + packed_bits = read(tmpio, bytes_needed) + write(data_to_compress_io, packed_bits) +end - if Missing <: T - # if there is missing - # use the bit packing algorithm to write the - # definition_levels +function write_encoded_data(data_to_compress_io, colvals::AbstractArray) + write(data_to_compress_io, val) +end - bytes_needed = ceil(Int, length(colvals) / 8sizeof(UInt8)) - tmp = UInt32((UInt32(bytes_needed) << 1) | 1) - bitpacking_header = LittleEndianBase128.encode(tmp) +function write_encoded_data(data_to_compress_io, colvals::SkipMissing) + for val in colvals + write(data_to_compress_io, val) + end +end - tmpio = IOBuffer() - not_missing_bits::BitArray = .!ismissing.(colvals) - write(tmpio, not_missing_bits) - seek(tmpio, 0) +# TODO set the encoding code into a dictionary +function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN}) + """ + Write a chunk of data into a data page using PLAIN encoding where the values + are written back-to-back in memory and then compressed with the codec. + For `String`s, the values are written with length (UInt32), followed by + content; it is NOT null terminated. 
+ """ - encoded_defn_data = read(tmpio, bytes_needed) + # generate the data page header + data_page_header = PAR2.PageHeader() - encoded_defn_data_length = length(bitpacking_header) + bytes_needed - # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) - write(data_to_compress_io, bitpacking_header) - write(data_to_compress_io, encoded_defn_data) - else - # if there is no missing can just use RLE of one - # using rle - rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) - repeated_value = UInt8(1) - encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) - - # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) - write(data_to_compress_io, rle_header) - write(data_to_compress_io, repeated_value) - end + # set up an IO buffer to write to + data_to_compress_io = IOBuffer() - if nonmissingtype(T) == String - # write the values - for val in skipmissing(colvals) - # for string it needs to be stored as BYTE_ARRAY which needs the length - # to be the first 4 bytes UInt32 - write(data_to_compress_io, val |> sizeof |> UInt32) - # write each of the strings one after another - write(data_to_compress_io, val) - end - elseif nonmissingtype(T) == Bool - # write the bitacpked bits - # write a bitarray seems to write 8 bytes at a time - # so write to a tmpio first - no_missing_bit_vec = BitArray(skipmissing(colvals)) - bytes_needed = ceil(Int, length(no_missing_bit_vec) / 8sizeof(UInt8)) - tmpio = IOBuffer() - write(tmpio, no_missing_bit_vec) - seek(tmpio, 0) - packed_bits = read(tmpio, bytes_needed) - write(data_to_compress_io, packed_bits) - else - for val in skipmissing(colvals) - write(data_to_compress_io, val) - end - end + # write repetition level data + ## do nothing + ## this seems to be related to nested columns + ## and hence is not needed here as we only supported unnested column write + + # write definition levels + write_defn_levels(data_to_compress_io, colvals) + + # write the encoded data + write_encoded_data(data_to_compress_io, colvals) data_to_compress::Vector{UInt8} = take!(data_to_compress_io) @@ -210,7 +243,7 @@ function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val{PAR2.En Thrift.set_field!(data_page_header, :data_page_header, PAR2.DataPageHeader()) Thrift.set_field!(data_page_header.data_page_header, :num_values , Int32(length(colvals))) - Thrift.set_field!(data_page_header.data_page_header, :encoding , encoding) # encoding 0 is plain encoding + Thrift.set_field!(data_page_header.data_page_header, :encoding , PAR2.Encoding.PLAIN) Thrift.set_field!(data_page_header.data_page_header, :definition_level_encoding, PAR2.Encoding.RLE) Thrift.set_field!(data_page_header.data_page_header, :repetition_level_encoding, PAR2.Encoding.RLE) @@ -228,9 +261,11 @@ function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val{PAR2.En ) end -function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val(PAR2.Encoding.PLAIN_DICTIONARY)) where T +function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN_DICTIONARY}) + """write Dictionary encoding data page""" error("PLAIN_DICTIONARY encoding not implemented yet") - """Dictionary encoding""" + + # TODO finish the implementation rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) repeated_value = UInt8(1) @@ -247,7 +282,6 @@ function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val(PAR2.En # write the data ## firstly, bit pack it - colvals # 
the bitwidth to use bitwidth = ceil(UInt8, log(2, length(uvals))) @@ -291,23 +325,18 @@ end function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; num_chunks = 1) where T """Write a column to a file""" # TODO turn writing dictionary on - if false - if nonmissingtype(T) == Bool - # dictionary type are not supported for - dict_info = (offset = missing, uncompressed_size = 0, compressed_size = 0) - else - dict_info = write_col_dict(fileio, colvals, codec) - end - else - # return offset of -1 means that dict_info - dict_info = (offset = missing, uncompressed_size = 0, compressed_size = 0) - end + # Currently, writing the dictionary page is not turned on for any type. + # Normally, for Boolean data, dictionary is not supported. However for other + # data types, dictionary page CAN be supported. However, since Parquet.jl + # only supports writing PLAIN encoding data, hence there is no need to write + # a dictionary page until other dictionary-based encodings are supported + dict_info = (offset = missing, uncompressed_size = 0, compressed_size = 0) num_vals_per_chunk = ceil(Int, length(colvals) / num_chunks) # TODO choose an encoding # TODO put encoding into a dictionary - chunk_info = [write_col_chunk(fileio, val_chunk, codec, encoding) for val_chunk in partition(colvals, num_vals_per_chunk)] + chunk_info = [write_col_chunk(fileio, val_chunk, codec, Val(encoding)) for val_chunk in partition(colvals, num_vals_per_chunk)] sizes = reduce(chunk_info; init = dict_info) do x, y ( diff --git a/test/test_writer.jl b/test/test_writer.jl index dbee089..bfb93b7 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -10,26 +10,27 @@ Random.seed!(1234567) function test_write() tbl = ( - int32 = Int32.(1:1000), - int64 = Int64.(1:1000), - float32 = Float32.(1:1000), - float64 = Float64.(1:1000), + int32 = rand(Int32, 1000), + int64 = rand(Int64, 1000), + float32 = rand(Float32, 1000), + float64 = rand(Float64, 1000), bool = rand(Bool, 1000), string = [randstring(8) for i in 1:1000], - int32m = rand([missing, 1:100...], 1000), - int64m = rand([missing, 1:100...], 1000), - float32m = rand([missing, Float32.(1:100)...], 1000), - float64m = rand([missing, Float64.(1:100)...], 1000), + int32m = rand([missing, rand(Int32, 10)...], 1000), + int64m = rand([missing, rand(Int64, 10)...], 1000), + float32m = rand([missing, rand(Float32, 10)...], 1000), + float64m = rand([missing, rand(Float64, 10)...], 1000), boolm = rand([missing, true, false], 1000), stringm = rand([missing, "abc", "def", "ghi"], 1000) ) tmpfile = tempname()*".parquet" + write_parquet(tmpfile, tbl) pf = ParFile(tmpfile) - # the file is very smalll so only one rowgroup + # the file is very small so only one rowgroup col_chunks = columns(pf, 1) for colnum in 1:length(col_chunks) @@ -49,8 +50,6 @@ function test_write() # clean up close(pf) - - #rm(tmpfile) end test_write() From 53382e270021ccee7ef35ab77f394ed1378cec3f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 15:26:56 +1000 Subject: [PATCH 14/52] minor bug fix --- src/writer.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/writer.jl b/src/writer.jl index 39a78b0..2104dec 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -192,7 +192,8 @@ function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{B end function write_encoded_data(data_to_compress_io, colvals::AbstractArray) - write(data_to_compress_io, val) + @assert isbitstype(eltype(colvals)) + write(data_to_compress_io, colvals) end function 
write_encoded_data(data_to_compress_io, colvals::SkipMissing) From 1bc2addd00aaa634f59894c3ff077af62d603758 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 19:03:41 +1000 Subject: [PATCH 15/52] fixed Julia 1.0.5 issue --- src/writer.jl | 6 +++--- test/test_writer.jl | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 2104dec..c5a99ea 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -9,10 +9,10 @@ using LittleEndianBase128 using Base.Iterators: partition using CategoricalArrays: CategoricalArray, CategoricalValue +using Base: SkipMissing + if VERSION < v"1.3" - using Missings: SkipMissing -else - using Base: SkipMissing + using Missings: nonmissingtype end # a mapping of Julia types to _Type codes in Parquet format diff --git a/test/test_writer.jl b/test/test_writer.jl index bfb93b7..24172f1 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -6,6 +6,8 @@ if VERSION < v"1.3" using Missings: nonmissingtype end +using Base.SkipMissing + Random.seed!(1234567) function test_write() From 5ebe14288e98fbe6daaa4fb33265c29450797433 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 19:04:24 +1000 Subject: [PATCH 16/52] minor bug fix --- test/test_writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 24172f1..1976d7c 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -6,7 +6,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -using Base.SkipMissing +using Base: SkipMissing Random.seed!(1234567) From 1f02847025f1cd7feee039d6118ca6a2adf86b0f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 19:06:00 +1000 Subject: [PATCH 17/52] removed minor --- test/test_writer.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 1976d7c..bfb93b7 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -6,8 +6,6 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -using Base: SkipMissing - Random.seed!(1234567) function test_write() From 7652a87ddb61754887f77c9e6d4cabc50b0ddca1 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 19:13:19 +1000 Subject: [PATCH 18/52] most general form of write_encoded_data --- src/writer.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/writer.jl b/src/writer.jl index c5a99ea..f4aeadb 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -168,6 +168,7 @@ write_encoded_data(data_to_compress_io, colvals::AbstractVector{Union{Missing, T write_encoded_data(data_to_compress_io, skipmissing(colvals)) function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{String}, SkipMissing{S}}) where S <: AbstractVector{Union{Missing, String}} + """ Write encoded data for String type """ # write the values for val in colvals # for string it needs to be stored as BYTE_ARRAY which needs the length @@ -179,6 +180,7 @@ function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{S end function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{Bool}, SkipMissing{S}}) where S <: AbstractVector{Union{Missing, Bool}} + """ Write encoded data for Bool type """ # write the bitacpked bits # write a bitarray seems to write 8 bytes at a time # so write to a tmpio first @@ -192,16 +194,27 @@ function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{B end function write_encoded_data(data_to_compress_io, colvals::AbstractArray) + """ Efficient write of 
encoded data for `isbits` types""" @assert isbitstype(eltype(colvals)) write(data_to_compress_io, colvals) end function write_encoded_data(data_to_compress_io, colvals::SkipMissing) + """ Write of encoded data for skipped missing types""" for val in colvals write(data_to_compress_io, val) end end +function write_encoded_data(data_to_compress_io, colvals) + """ Write of encoded data for the most general type. + The only requirement is that colvals has to be iterable + """ + for val in skipmissing(colvals) + write(data_to_compress_io, val) + end +end + # TODO set the encoding code into a dictionary function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN}) """ From a4e3ffec546acb14f916947b9da1401cd4db0927 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 21:27:16 +1000 Subject: [PATCH 19/52] refactored into internal methods --- src/writer.jl | 160 ++++++++++++++++++++++++++++---------------------- 1 file changed, 90 insertions(+), 70 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index f4aeadb..73b8313 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -216,7 +216,7 @@ function write_encoded_data(data_to_compress_io, colvals) end # TODO set the encoding code into a dictionary -function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN}) +function write_col_page(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN}) """ Write a chunk of data into a data page using PLAIN encoding where the values are written back-to-back in memory and then compressed with the codec. @@ -275,7 +275,7 @@ function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encod ) end -function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN_DICTIONARY}) +function write_col_page(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN_DICTIONARY}) """write Dictionary encoding data page""" error("PLAIN_DICTIONARY encoding not implemented yet") @@ -328,7 +328,7 @@ function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encod end end -function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, encoding) where T +function write_col_page(fileio, colvals::AbstractArray{T}, codec, encoding) where T error("Page encoding $encoding is yet not implemented.") end @@ -336,7 +336,7 @@ write_col(fileio, colvals::CategoricalArray, args...; kwars...) = begin throw("Currently CategoricalArrays are not supported.") end -function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; num_chunks = 1) where T +function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; nchunks = 1) where T """Write a column to a file""" # TODO turn writing dictionary on # Currently, writing the dictionary page is not turned on for any type. 
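The `<< 1` and `| 1` arithmetic that `write_defn_levels` uses for definition levels comes from Parquet's hybrid RLE/bit-packing format: an RLE run is announced with a ULEB128 varint of `count << 1`, while a bit-packed run uses `(groups_of_8 << 1) | 1` (with bit width 1 the number of 8-value groups equals the number of bytes needed). Below is a minimal, self-contained sketch of just that header arithmetic, using a hand-rolled ULEB128 varint in place of `LittleEndianBase128.encode`; the helper names `uleb128`, `rle_header`, and `bitpacked_header` are illustrative stand-ins, not part of Parquet.jl or LittleEndianBase128.

```julia
# Sketch of the hybrid RLE / bit-packing run headers used for definition levels.
# ULEB128 varint encoder (stand-in for LittleEndianBase128.encode).
function uleb128(x::UInt32)
    bytes = UInt8[]
    while true
        byte = UInt8(x & 0x7f)
        x >>= 7
        if x == 0
            push!(bytes, byte)
            return bytes
        else
            push!(bytes, byte | 0x80)   # set continuation bit
        end
    end
end

# RLE run header: varint(count << 1). Used when a column has no missings,
# so one run of the repeated value 1 covers every row.
rle_header(count) = uleb128(UInt32(count) << 1)

# Bit-packed run header: varint((groups_of_8 << 1) | 1). With bit width 1,
# the number of groups of 8 equals the number of bytes of packed bits.
function bitpacked_header(nvalues)
    groups = cld(nvalues, 8)
    uleb128(UInt32((groups << 1) | 1))
end

rle_header(1000)        # => UInt8[0xd0, 0x0f]
bitpacked_header(1000)  # => UInt8[0xfb, 0x01]
```

For 1000 rows with no missing values, `rle_header(1000)` should reproduce the two-byte run header that the patch writes immediately before the repeated `UInt8(1)` definition level.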
@@ -346,11 +346,9 @@ function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; # a dictionary page until other dictionary-based encodings are supported dict_info = (offset = missing, uncompressed_size = 0, compressed_size = 0) - num_vals_per_chunk = ceil(Int, length(colvals) / num_chunks) + num_vals_per_chunk = ceil(Int, length(colvals) / nchunks) - # TODO choose an encoding - # TODO put encoding into a dictionary - chunk_info = [write_col_chunk(fileio, val_chunk, codec, Val(encoding)) for val_chunk in partition(colvals, num_vals_per_chunk)] + chunk_info = [write_col_page(fileio, val_chunk, codec, Val(encoding)) for val_chunk in partition(colvals, num_vals_per_chunk)] sizes = reduce(chunk_info; init = dict_info) do x, y ( @@ -359,13 +357,53 @@ function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; ) end + # write the column metadata + # can probably write the metadata right after the data chunks + col_meta = PAR2.ColumnMetaData() + + Thrift.set_field!(col_meta, :_type, COL_TYPE_CODE[eltype(colvals) |> nonmissingtype]) + # these are all the fields + # TODO collect all the encodings used + if eltype(colvals) == Bool + Thrift.set_field!(col_meta, :encodings, Int32[0, 3]) + else + Thrift.set_field!(col_meta, :encodings, Int32[2, 0, 3]) + end + Thrift.set_field!(col_meta, :path_in_schema, [colname]) + Thrift.set_field!(col_meta, :codec, codec) + Thrift.set_field!(col_meta, :num_values, length(colvals)) + + Thrift.set_field!(col_meta, :total_uncompressed_size, sizes.uncompressed_size) + Thrift.set_field!(col_meta, :total_compressed_size, sizes.compressed_size) + + Thrift.set_field!(col_meta, :data_page_offset, chunk_info[1].offset) + if !ismissing(dict_info.offset) + Thrift.set_field!(col_meta, :dictionary_page_offset, dict_info.offset) + end + + # write the column meta data right after the data + # keep track of the position so it can put into the column chunk + # metadata + col_meta_offset = position(fileio) + write_thrift(fileio, col_meta) + + # Prep metadata for the filemetadata + ## column chunk metadata + col_chunk_meta = PAR2.ColumnChunk() + + Thrift.set_field!(col_chunk_meta, :file_offset, col_meta_offset) + Thrift.set_field!(col_chunk_meta, :meta_data, col_meta) + Thrift.clear(col_chunk_meta, :offset_index_offset) + Thrift.clear(col_chunk_meta, :offset_index_length) + Thrift.clear(col_chunk_meta, :column_index_offset) + Thrift.clear(col_chunk_meta, :column_index_length) + return ( - dictionary_page_offset = dict_info.offset, data_page_offset = chunk_info[1].offset, - uncompressed_size = sizes.uncompressed_size, - compressed_size = sizes.compressed_size, + dictionary_page_offset = dict_info.offset, + col_chunk_meta = col_chunk_meta, + col_meta_offset = col_meta_offset ) - end function create_schema_parent_node(ncols) @@ -446,20 +484,6 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") # convert a string or symbol compression codec into the numeric code codec = getproperty(PAR2.CompressionCodec, Symbol(uppercase(string(compression_codec)))) - fileio = open(path, "w") - write(fileio, "PAR1") - - colnames = Tables.columnnames(tbl) - ncols = length(colnames) - nrows = length(Tables.rows(tbl)) - - # the + 1 comes from the fact that schema is tree and there is an extra - # parent node - schemas = Vector{PAR2.SchemaElement}(undef, ncols + 1) - schemas[1] = create_schema_parent_node(ncols) - col_chunk_metas = Vector{PAR2.ColumnChunk}(undef, ncols) - row_group_file_offset = missing - # figure out the right number of chunks # TODO test 
that it works for all supported table table_size_bytes = Base.summarysize(tbl) @@ -467,6 +491,9 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") approx_raw_to_parquet_compression_ratio = 6 approx_post_compression_size = (table_size_bytes / 2^30) / approx_raw_to_parquet_compression_ratio + colnames = String.(Tables.columnnames(tbl)) + nrows = length(Tables.rows(tbl)) + # if size is larger than 64mb and has more than 6 rows if (approx_post_compression_size > 0.064) & (nrows > 6) recommended_chunks = ceil(Int, approx_post_compression_size / 6) * 6 @@ -474,12 +501,46 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") recommended_chunks = 1 end + _write_parquet( + tbl, + path, + recommended_chunks; + encoding = Dict(String(col)=>encoding for col in colnames), + codec = Dict(String(col)=>codec for col in colnames) + ) +end + +function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec::Dict{String, Int32}) + """Internal method for writing parquet + + tbl - Expected to be a Tables.jl compatible table + path - The output parquet file path + + """ + fileio = open(path, "w") + write(fileio, "PAR1") + + colnames = Tables.columnnames(tbl) + ncols = length(colnames) + nrows = length(Tables.rows(tbl)) + + # the + 1 comes from the fact that schema is tree and there is an extra + # parent node + schemas = Vector{PAR2.SchemaElement}(undef, ncols + 1) + schemas[1] = create_schema_parent_node(ncols) + col_chunk_metas = Vector{PAR2.ColumnChunk}(undef, ncols) + row_group_file_offset = missing + + # write the columns one by one + # TODO parallelize this for (coli, colname_sym) in enumerate(colnames) colvals = Tables.getcolumn(tbl, colname_sym) colname = String(colname_sym) - # write the data - col_info = write_col(fileio, colvals, colname, encoding, codec; num_chunks = recommended_chunks) + col_encoding = encoding[colname] + col_codec = codec[colname] + # write the data including metadata + col_info = write_col(fileio, colvals, colname, col_encoding, col_codec; nchunks = nchunks) # the `row_group_file_offset` keeps track where the data # starts, so keep it at the dictonary of the first data @@ -491,48 +552,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") end end - # write the column metadata - # can probably write the metadata right after the data chunks - col_meta = PAR2.ColumnMetaData() - - Thrift.set_field!(col_meta, :_type, COL_TYPE_CODE[eltype(colvals) |> nonmissingtype]) - # these are all the fields - # TODO collect all the encodings used - if eltype(colvals) == Bool - Thrift.set_field!(col_meta, :encodings, Int32[0, 3]) - else - Thrift.set_field!(col_meta, :encodings, Int32[2, 0, 3]) - end - Thrift.set_field!(col_meta, :path_in_schema, [colname]) - Thrift.set_field!(col_meta, :codec, codec) - Thrift.set_field!(col_meta, :num_values, length(colvals)) - - Thrift.set_field!(col_meta, :total_uncompressed_size, col_info.uncompressed_size) - Thrift.set_field!(col_meta, :total_compressed_size, col_info.compressed_size) - - Thrift.set_field!(col_meta, :data_page_offset, col_info.data_page_offset) - if !ismissing(col_info.dictionary_page_offset) - Thrift.set_field!(col_meta, :dictionary_page_offset, col_info.dictionary_page_offset) - end - - # write the column meta data right after the data - # keep track of the position so it can put into the column chunk - # metadata - col_meta_offset = position(fileio) - write_thrift(fileio, col_meta) - - # Prep metadata for the filemetadata - ## column chunk metadata - col_chunk_meta = 
PAR2.ColumnChunk() - - Thrift.set_field!(col_chunk_meta, :file_offset, col_meta_offset) - Thrift.set_field!(col_chunk_meta, :meta_data, col_meta) - Thrift.clear(col_chunk_meta, :offset_index_offset) - Thrift.clear(col_chunk_meta, :offset_index_length) - Thrift.clear(col_chunk_meta, :column_index_offset) - Thrift.clear(col_chunk_meta, :column_index_length) - - col_chunk_metas[coli] = col_chunk_meta + col_chunk_metas[coli] = col_info.col_chunk_meta # add the schema schemas[coli + 1] = create_col_schema(eltype(colvals) |> nonmissingtype, colname) From 06fb6996bbdab20ff8b11f6f3546d0e231447b18 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 21:29:09 +1000 Subject: [PATCH 20/52] minor for clarity --- src/writer.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 73b8313..a9b48c1 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -491,16 +491,15 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") approx_raw_to_parquet_compression_ratio = 6 approx_post_compression_size = (table_size_bytes / 2^30) / approx_raw_to_parquet_compression_ratio - colnames = String.(Tables.columnnames(tbl)) - nrows = length(Tables.rows(tbl)) - # if size is larger than 64mb and has more than 6 rows + nrows = length(Tables.rows(tbl)) if (approx_post_compression_size > 0.064) & (nrows > 6) recommended_chunks = ceil(Int, approx_post_compression_size / 6) * 6 else recommended_chunks = 1 end + colnames = String.(Tables.columnnames(tbl)) _write_parquet( tbl, path, From c0bd4d0f2cdc602e8aaf83af8ef54174387a9eac Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 21:32:24 +1000 Subject: [PATCH 21/52] minor update --- src/writer.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index a9b48c1..6aac875 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -504,8 +504,8 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") tbl, path, recommended_chunks; - encoding = Dict(String(col)=>encoding for col in colnames), - codec = Dict(String(col)=>codec for col in colnames) + encoding = Dict(col => encoding for col in colnames), + codec = Dict(col => codec for col in colnames) ) end @@ -514,6 +514,9 @@ function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec tbl - Expected to be a Tables.jl compatible table path - The output parquet file path + nchunks - The number of chunks/pages to write the columns + encoding - A dictionary mapping from column names to encoding + codec - A dictionary mapping from column names to compressoin codec """ fileio = open(path, "w") From 774bb4c3384d26f6123167e2c3a735a1a5f0ce24 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 11:38:30 +1000 Subject: [PATCH 22/52] fixed all comments --- src/reader.jl | 4 -- src/writer.jl | 152 +++++++++++++++++++++++++------------------------- 2 files changed, 77 insertions(+), 79 deletions(-) diff --git a/src/reader.jl b/src/reader.jl index 9c94622..9150e05 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -76,10 +76,6 @@ function close(par::ParFile) close(par.handle) end -function Base.close(par::ParFile) - close(par.handle) -end - ## # layer 1 access # can access raw (uncompressed) bytes from pages diff --git a/src/writer.jl b/src/writer.jl index 6aac875..3d0ce5b 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -29,8 +29,14 @@ const COL_TYPE_CODE = Dict{DataType, Int32}( function write_thrift(fileio, thrift_obj) """write thrift definition to file""" + pos_before_write = 
position(fileio) p = TCompactProtocol(TFileTransport(fileio)) Thrift.write(p, thrift_obj) + pos_after_write = position(fileio) + + size_of_written = pos_after_write - pos_before_write + + size_of_written end function compress_using_codec(colvals::AbstractArray, codec::Integer)::Vector{UInt8} @@ -111,57 +117,56 @@ function write_defn_levels(data_to_compress_io, colvals::AbstractVector) write(data_to_compress_io, repeated_value) end -function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T - """ write the column dictionary page """ - # note: `level`s does not return `missing` as a level - uvals = DataAPI.levels(colvals) - - # do not support dictionary with more than 127 levels - # TODO relax this 127 restriction - if length(uvals) > 127 - @warn "More than 127 levels in dictionary. Parquet.jl does not support this at this stage." - return (offset = missing, uncompressed_size = 0, compressed_size = 0) - end - - if nonmissingtype(T) == String - # the raw bytes of made of on UInt32 to indicate string length - # and the content of the string - # so the formula for dict size is as below - uncompressed_dict_size = sizeof(UInt32)*length(uvals) + sum(sizeof, uvals) - else - uncompressed_dict_size = length(uvals)*sizeof(eltype(uvals)) - end - - compressed_uvals::Vector{UInt8} = compress_using_codec(uvals, codec) - compressed_dict_size = length(compressed_uvals) - - # TODO do the CRC properly - crc = 0 - - # construct dictionary metadata - dict_page_header = PAR2.PageHeader() - - Thrift.set_field!(dict_page_header, :_type, PAR2.PageType.DICTIONARY_PAGE) - Thrift.set_field!(dict_page_header, :uncompressed_page_size , uncompressed_dict_size) - Thrift.set_field!(dict_page_header, :compressed_page_size , compressed_dict_size) - Thrift.set_field!(dict_page_header, :crc , crc) - - Thrift.set_field!(dict_page_header, :dictionary_page_header, PAR2.DictionaryPageHeader()) - Thrift.set_field!(dict_page_header.dictionary_page_header, :num_values , Int32(length(uvals))) - Thrift.set_field!(dict_page_header.dictionary_page_header, :encoding , PAR2.Encoding.PLAIN_DICTIONARY) - Thrift.set_field!(dict_page_header.dictionary_page_header, :is_sorted , false) - - before_write_page_header_pos = position(fileio) - - write_thrift(fileio, dict_page_header) - - dict_page_header_size = position(fileio) - before_write_page_header_pos - - # write the dictionary data - write(fileio, compressed_uvals) - - return (offset = before_write_page_header_pos, uncompressed_size = uncompressed_dict_size + dict_page_header_size, compressed_size = compressed_dict_size + dict_page_header_size) -end +# TODO turn this on when writing dictionary is necessary +# function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T +# """ write the column dictionary page """ +# # note: `level`s does not return `missing` as a level +# uvals = DataAPI.levels(colvals) +# +# # do not support dictionary with more than 127 levels +# # TODO relax this 127 restriction +# if length(uvals) > 127 +# @warn "More than 127 levels in dictionary. Parquet.jl does not support this at this stage." 
+# return (offset = missing, uncompressed_size = 0, compressed_size = 0) +# end +# +# if nonmissingtype(T) == String +# # the raw bytes of made of on UInt32 to indicate string length +# # and the content of the string +# # so the formula for dict size is as below +# uncompressed_dict_size = sizeof(UInt32)*length(uvals) + sum(sizeof, uvals) +# else +# uncompressed_dict_size = length(uvals)*sizeof(eltype(uvals)) +# end +# +# compressed_uvals::Vector{UInt8} = compress_using_codec(uvals, codec) +# compressed_dict_size = length(compressed_uvals) +# +# # TODO do the CRC properly +# crc = 0 +# +# # construct dictionary metadata +# dict_page_header = PAR2.PageHeader() +# +# Thrift.set_field!(dict_page_header, :_type, PAR2.PageType.DICTIONARY_PAGE) +# Thrift.set_field!(dict_page_header, :uncompressed_page_size , uncompressed_dict_size) +# Thrift.set_field!(dict_page_header, :compressed_page_size , compressed_dict_size) +# Thrift.set_field!(dict_page_header, :crc , crc) +# +# Thrift.set_field!(dict_page_header, :dictionary_page_header, PAR2.DictionaryPageHeader()) +# Thrift.set_field!(dict_page_header.dictionary_page_header, :num_values , Int32(length(uvals))) +# Thrift.set_field!(dict_page_header.dictionary_page_header, :encoding , PAR2.Encoding.PLAIN_DICTIONARY) +# Thrift.set_field!(dict_page_header.dictionary_page_header, :is_sorted , false) +# +# before_write_page_header_pos = position(fileio) +# +# dict_page_header_size = write_thrift(fileio, dict_page_header) +# +# # write the dictionary data +# write(fileio, compressed_uvals) +# +# return (offset = before_write_page_header_pos, uncompressed_size = uncompressed_dict_size + dict_page_header_size, compressed_size = compressed_dict_size + dict_page_header_size) +# end write_encoded_data(data_to_compress_io, colvals::AbstractVector{Union{Missing, T}}) where T = @@ -262,8 +267,8 @@ function write_col_page(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encodi Thrift.set_field!(data_page_header.data_page_header, :repetition_level_encoding, PAR2.Encoding.RLE) position_before_page_header_write = position(fileio) - write_thrift(fileio, data_page_header) - size_of_page_header_defn_repn = position(fileio) - position_before_page_header_write + + size_of_page_header_defn_repn = write_thrift(fileio, data_page_header) # write data write(fileio, compressed_data) @@ -501,7 +506,8 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") colnames = String.(Tables.columnnames(tbl)) _write_parquet( - tbl, + Tables.columns(tbl), + Tables.columnnames(tbl), path, recommended_chunks; encoding = Dict(col => encoding for col in colnames), @@ -509,24 +515,23 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") ) end -function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec::Dict{String, Int32}) +function _write_parquet(itr_vectors, colnames, path, nchunks; encoding::Dict{String, Int32}, codec::Dict{String, Int32}) """Internal method for writing parquet - tbl - Expected to be a Tables.jl compatible table - path - The output parquet file path - nchunks - The number of chunks/pages to write the columns - encoding - A dictionary mapping from column names to encoding - codec - A dictionary mapping from column names to compressoin codec - + itr_vectors - An iterable of `AbstractVector`s containing the values to be + written + colnames - Column names for each of the vectors + path - The output parquet file path + nchunks - The number of chunks/pages to write for each column + encoding - A dictionary mapping from column names to 
encoding + codec - A dictionary mapping from column names to compression codec """ fileio = open(path, "w") write(fileio, "PAR1") - colnames = Tables.columnnames(tbl) - ncols = length(colnames) - nrows = length(Tables.rows(tbl)) + ncols = length(itr_vectors) - # the + 1 comes from the fact that schema is tree and there is an extra + # the + 1 comes from the fact that schema is a tree and there is an extra # parent node schemas = Vector{PAR2.SchemaElement}(undef, ncols + 1) schemas[1] = create_schema_parent_node(ncols) @@ -535,8 +540,8 @@ function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec # write the columns one by one # TODO parallelize this - for (coli, colname_sym) in enumerate(colnames) - colvals = Tables.getcolumn(tbl, colname_sym) + nrows = -1 # initialize it + for (coli, (colname_sym, colvals)) in enumerate(zip(colnames, itr_vectors)) colname = String(colname_sym) col_encoding = encoding[colname] @@ -544,9 +549,10 @@ function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec # write the data including metadata col_info = write_col(fileio, colvals, colname, col_encoding, col_codec; nchunks = nchunks) - # the `row_group_file_offset` keeps track where the data - # starts, so keep it at the dictonary of the first data + # the `row_group_file_offset` keeps track of where the data starts, so + # keep it at the dictonary of the first data if coli == 1 + nrows = length(colvals) if ismissing(col_info.dictionary_page_offset) row_group_file_offset = col_info.data_page_offset else @@ -584,13 +590,9 @@ function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec Thrift.set_field!(filemetadata, :row_groups, [row_group]) - position_before_filemetadata_write = position(fileio) - - write_thrift(fileio, filemetadata) - - filemetadata_size = position(fileio) - position_before_filemetadata_write + filemetadata_size = write_thrift(fileio, filemetadata) - write(fileio, Int32(filemetadata_size)) + write(fileio, UInt32(filemetadata_size)) write(fileio, "PAR1") close(fileio) end From 36fdd327b82de608a8142c38a98ffcbaae72e404 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 11:47:27 +1000 Subject: [PATCH 23/52] Update writer.jl --- src/writer.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 3d0ce5b..3bf213e 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -515,7 +515,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") ) end -function _write_parquet(itr_vectors, colnames, path, nchunks; encoding::Dict{String, Int32}, codec::Dict{String, Int32}) +function _write_parquet(itr_vectors, colnames, path, nchunks; ncols = length(itr_vectors), encoding::Dict{String, Int32}, codec::Dict{String, Int32}) """Internal method for writing parquet itr_vectors - An iterable of `AbstractVector`s containing the values to be @@ -523,14 +523,15 @@ function _write_parquet(itr_vectors, colnames, path, nchunks; encoding::Dict{Str colnames - Column names for each of the vectors path - The output parquet file path nchunks - The number of chunks/pages to write for each column + ncols - The number of columns. This is provided as an argument for + the case where the `length(itr_vectors)` is not defined, + e.g. lazy loading of remote resources. 
encoding - A dictionary mapping from column names to encoding codec - A dictionary mapping from column names to compression codec """ fileio = open(path, "w") write(fileio, "PAR1") - ncols = length(itr_vectors) - # the + 1 comes from the fact that schema is a tree and there is an extra # parent node schemas = Vector{PAR2.SchemaElement}(undef, ncols + 1) From ba78cb8e4f0b974a7a0749abcabc43922a51be60 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 12:09:50 +1000 Subject: [PATCH 24/52] made version number of package a constant instead of relying on the directory of the package, because relying on directory makes Parquet.jl static compilation unfriendly. --- Project.toml | 1 - src/Parquet.jl | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index be869a4..4738555 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,6 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" diff --git a/src/Parquet.jl b/src/Parquet.jl index b7d71df..ed45a0f 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -12,8 +12,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -using Pkg -const PARQUET_JL_VERSION = VersionNumber(Pkg.TOML.parsefile(joinpath(@__DIR__, "..", "Project.toml"))["version"]) +const PARQUET_JL_VERSION = v"0.4.0" import Base: show, open, close, values, eltype, length import Thrift: isfilled From 656d503029c73d8e38c664358752f59ba703962d Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 13:26:33 +1000 Subject: [PATCH 25/52] fixed bug of not writing DataFrame properly --- src/writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/writer.jl b/src/writer.jl index 3bf213e..afaadac 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -506,7 +506,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") colnames = String.(Tables.columnnames(tbl)) _write_parquet( - Tables.columns(tbl), + Tables.Columns(tbl), Tables.columnnames(tbl), path, recommended_chunks; From 7eda104bd1609ea311d2967844fcca749e24445f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 13:30:51 +1000 Subject: [PATCH 26/52] updated parquet --- src/Parquet.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parquet.jl b/src/Parquet.jl index 05e9001..fa2e60e 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -11,7 +11,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -const PARQUET_JL_VERSION = v"0.4.0" +const PARQUET_JL_VERSION = v"0.5.0" import Base: show, open, close, values, eltype, length import Thrift: isfilled From 04cca7815dd0340e13d99c4a5be147b796c62276 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 14:49:08 +1000 Subject: [PATCH 27/52] removed protobuf --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index c96986d..601e254 100644 --- a/Project.toml +++ b/Project.toml @@ -26,7 +26,6 @@ DataAPI = "1" LittleEndianBase128 = "0.3" MemPool = "0.2" Missings = "0.3,0.4" -ProtoBuf = "0.7,0.8" Snappy = "0.3" Tables = "1" Thrift = "0.6,0.7" From 2a9ff1d9e81da894576db93bec10b49dbfb9d02e Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 15:48:31 +1000 Subject: 
[PATCH 28/52] upped version to 0.5.1 --- Project.toml | 2 +- src/Parquet.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 601e254..2c22e9d 100644 --- a/Project.toml +++ b/Project.toml @@ -3,7 +3,7 @@ uuid = "626c502c-15b0-58ad-a749-f091afb673ae" keywords = ["parquet", "julia", "columnar-storage"] license = "MIT" desc = "Julia implementation of parquet columnar file format reader and writer" -version = "0.5.0" +version = "0.5.1" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" diff --git a/src/Parquet.jl b/src/Parquet.jl index fa2e60e..313f8dd 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -11,7 +11,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -const PARQUET_JL_VERSION = v"0.5.0" +const PARQUET_JL_VERSION = v"0.5.1" import Base: show, open, close, values, eltype, length import Thrift: isfilled From a6f2a8a3b4fed27e53ff695f46514d2bdc2c15ec Mon Sep 17 00:00:00 2001 From: tan Date: Mon, 18 May 2020 18:57:23 +0530 Subject: [PATCH 29/52] performace improvements, few fixes - fix condition for missing column values when row can not be located in a column chunk - few performance improvements --- src/codec.jl | 61 ++++++++++++------------- src/cursor.jl | 110 ++++++++++++++++++++++++--------------------- src/reader.jl | 22 ++++----- test/test_codec.jl | 8 +++- 4 files changed, 104 insertions(+), 97 deletions(-) diff --git a/src/codec.jl b/src/codec.jl index 7278ad6..80fd244 100644 --- a/src/codec.jl +++ b/src/codec.jl @@ -4,18 +4,17 @@ const MSB = 0x80 const MASK7 = 0x7f const MASK8 = 0xff const MASK3 = 0x07 -function MASKN(nbits) - T = byt2uitype_small(bit2bytewidth(nbits)) +Base.@pure function MASKN(nbits::UInt8, ::Type{T}=byt2uitype_small(bit2bytewidth(nbits))) where {T} O = convert(T, 0x1) (O << nbits) - O end -bitwidth(i) = ceil(Int, log(2, i+1)) -bytewidth(i) = bit2bytewidth(bitwidth(i)) -bit2bytewidth(i) = ceil(Int, i/8) -byt2itype(i) = (i <= 4) ? Int32 : (i <= 8) ? Int64 : Int128 -byt2uitype(i) = (i <= 4) ? UInt32 : (i <= 8) ? UInt64 : UInt128 -byt2uitype_small(i) = (i <= 1) ? UInt8 : (i <= 2) ? UInt16 : (i <= 4) ? UInt32 : (i <= 8) ? UInt64 : UInt128 +Base.@pure bitwidth(i::Int) = ceil(Int, log(2, i+1)) +#bytewidth(i) = bit2bytewidth(bitwidth(i)) +Base.@pure bit2bytewidth(i::UInt8) = ceil(Int, i/8) +Base.@pure byt2itype(i::Int) = (i <= 4) ? Int32 : (i <= 8) ? Int64 : Int128 +Base.@pure byt2uitype(i::Int) = (i <= 4) ? UInt32 : (i <= 8) ? UInt64 : UInt128 +Base.@pure byt2uitype_small(i::Int) = (i <= 1) ? UInt8 : (i <= 2) ? UInt16 : (i <= 4) ? UInt32 : (i <= 8) ? 
UInt64 : UInt128 read_fixed(io::IO, typ::Type{UInt32}) = _read_fixed(io, convert(UInt32,0), 4) read_fixed(io::IO, typ::Type{UInt64}) = _read_fixed(io, convert(UInt64,0), 8) @@ -72,19 +71,19 @@ function read_plain(io::IO, typ::Int32, jtype::Type{T}=PLAIN_JTYPES[typ+1]) wher end # read plain values or dictionary (PLAIN_DICTIONARY = 2) -function read_plain_values(io::IO, count::Integer, typ::Int32) - @debug("reading plain values", type=typ, count=count) +function read_plain_values(io::IO, count::Int32, typ::Int32) + #@debug("reading plain values", type=typ, count=count) if typ == _Type.BOOLEAN arr = read_bitpacked_booleans(io, count) else arr = [read_plain(io, typ) for i in 1:count] end - @debug("read $(length(arr)) plain values") + #@debug("read $(length(arr)) plain values") arr end -function read_bitpacked_booleans(io::IO, count::Integer) #, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count); read_len::Bool=true) where {T <: Integer} - @debug("reading bitpacked booleans", count) +function read_bitpacked_booleans(io::IO, count::Int32) + #@debug("reading bitpacked booleans", count) arr = falses(count) arrpos = 1 bits = UInt8(0) @@ -92,7 +91,7 @@ function read_bitpacked_booleans(io::IO, count::Integer) #, bits::Integer, byt:: while arrpos <= count if bitpos > 8 bits = read(io, UInt8) - @debug("bits", bits, bitstring(bits)) + #@debug("bits", bits, bitstring(bits)) bitpos = 1 end arr[arrpos] = Bool(bits & 0x1) @@ -104,16 +103,16 @@ function read_bitpacked_booleans(io::IO, count::Integer) #, bits::Integer, byt:: end # read rle dictionary (RLE_DICTIONARY = 8, or PLAIN_DICTIONARY = 2 in a data page) -function read_rle_dict(io::IO, count::Integer) +function read_rle_dict(io::IO, count::Int32) bits = read(io, UInt8) - @debug("reading rle dictionary bits:$bits") + #@debug("reading rle dictionary bits:$bits") arr = read_hybrid(io, count, bits; read_len=false) - @debug("read $(length(arr)) dictionary values") + #@debug("read $(length(arr)) dictionary values") arr end # read RLE or bit backed format (RLE = 3) -function read_hybrid(io::IO, count::Integer, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count); read_len::Bool=true) where {T <: Integer} +function read_hybrid(io::IO, count::Int32, bits::UInt8, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count); read_len::Bool=true) where {T <: Integer} len = read_len ? read_fixed(io, Int32) : Int32(0) @debug("reading hybrid data", len, count, bits) arrpos = 1 @@ -137,13 +136,13 @@ function read_hybrid(io::IO, count::Integer, bits::Integer, byt::Int=bit2bytewid arr end -function read_rle_run(io::IO, count::Integer, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(count)) where {T <: Integer} - @debug("read_rle_run. 
count:$count, typ:$T, nbits:$bits, nbytes:$byt") +function read_rle_run(io::IO, count::Int, bits::UInt8, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(count)) where {T <: Integer} + @debug("read_rle_run", count, T, bits, byt) arr[1:count] .= reinterpret(T, _read_fixed(io, zero(byt2uitype(byt)), byt)) arr end -function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, grp_count*8)) where {T <: Integer} +function read_bitpacked_run(io::IO, grp_count::Int, bits::UInt8, byt::Int, typ::Type{T}, arr::Vector{T}, mask::V=MASKN(bits)) where {T <: Integer, V <: Integer} count = min(grp_count * 8, length(arr)) # multiple of 8 values at a time are bit packed together nbytes = bits * grp_count # same as: round(Int, (bits * grp_count * 8) / 8) @@ -151,11 +150,9 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= data = Array{UInt8}(undef, min(nbytes, bytesavailable(io))) read!(io, data) - mask = MASKN(bits) - V = typeof(mask) bitbuff = zero(V) - nbitsbuff = 0 - shift = 0 + nbitsbuff = UInt8(0) + shift = UInt8(0) arridx = 1 dataidx = 1 @@ -165,9 +162,9 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= # we have leftover bits, which must be appended if nbitsbuff < bits # but only append if we need to read more in this cycle - arr[arridx] = bitbuff & MASKN(nbitsbuff) + @inbounds arr[arridx] = bitbuff & MASKN(nbitsbuff, V) shift = nbitsbuff - nbitsbuff = 0 + nbitsbuff = UInt8(0) bitbuff = zero(V) end end @@ -177,7 +174,7 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= # shift 8 bits and read directly into bitbuff bitbuff |= (V(data[dataidx]) << nbitsbuff) dataidx += 1 - nbitsbuff += 8 + nbitsbuff += UInt8(8) end # set values @@ -188,7 +185,7 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= arr[arridx] |= convert(T, (bitbuff << shift) & mask) bitbuff >>= remshift nbitsbuff -= remshift - shift = 0 + shift = UInt8(0) else #@debug("setting all from bitbuff nbitsbuff:$nbitsbuff") arr[arridx] = convert(T, bitbuff & mask) @@ -202,17 +199,15 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= end # read bit packed in deprecated format (BIT_PACKED = 4) -function read_bitpacked_run_old(io::IO, count::Integer, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count)) where {T <: Integer} +function read_bitpacked_run_old(io::IO, count::Int, bits::UInt8, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count), mask::V=MASKN(bits)) where {T <: Integer, V <: Integer} # multiple of 8 values at a time are bit packed together nbytes = round(Int, (bits * count) / 8) - @debug("read_bitpacked_run. count:$count, nbytes:$nbytes, nbits:$bits") + #@debug("read_bitpacked_run. 
count:$count, nbytes:$nbytes, nbits:$bits") data = Array{UInt8}(undef, nbytes) read!(io, data) # the mask is of the smallest bounding type for bits # T is one of the types that map on to the appropriate Julia type in Parquet (which may be larger than the mask type) - mask = MASKN(bits) - V = typeof(mask) bitbuff = zero(V) nbitsbuff = 0 diff --git a/src/cursor.jl b/src/cursor.jl index bfc6d43..1c7bf72 100644 --- a/src/cursor.jl +++ b/src/cursor.jl @@ -127,7 +127,7 @@ function setrow(cursor::ColCursor{T}, row::Int64) where {T} end # find the column chunk with the row - if cursor.ccrange===nothing || !(row in cursor.ccrange) + if (cursor.ccrange === nothing) || !(row in cursor.ccrange) offset = rowgroup_offset(cursor.row) # the offset of row from beginning of current rowgroup colchunks = cursor.colchunks @@ -137,7 +137,7 @@ function setrow(cursor::ColCursor{T}, row::Int64) where {T} if isempty(repn_levels) nrowscc = length(vals) # number of values is number of rows else - nrowscc = length(repn_levels) - length(find(repn_levels)) # number of values where repeation level is 0 + nrowscc = length(repn_levels) - length(find(repn_levels)) # number of values where repetition level is 0 end ccrange = startrow:(startrow + nrowscc) @@ -156,49 +156,55 @@ function setrow(cursor::ColCursor{T}, row::Int64) where {T} end end - # find the starting positions for values and levels - ccrange = cursor.ccrange - defn_levels = cursor.defn_levels - repn_levels = cursor.repn_levels - levelpos = valpos = Int64(0) - - # compute the level and value pos for row - if isempty(repn_levels) - # no repetitions, so each entry corresponds to one full row - levelpos = row - first(ccrange) + 1 - levelrange = levelpos:levelpos + if cursor.ccrange === nothing + # we did not find the row in this column + cursor.valpos = cursor.levelpos = 0 + cursor.levelrange = 0:-1 #cursor.valrange = 0:-1 else - # multiple entries may constitute one row - idx = first(ccrange) - levelpos = findfirst(repn_levels, 0) # NOTE: can start from cursor.levelpos to optimize, but that will prevent using setrow to go backwards - while idx < row - levelpos = findnext(repn_levels, 0, levelpos+1) - idx += 1 + # find the starting positions for values and levels + ccrange = cursor.ccrange + defn_levels = cursor.defn_levels + repn_levels = cursor.repn_levels + levelpos = valpos = Int64(0) + + # compute the level and value pos for row + if isempty(repn_levels) + # no repetitions, so each entry corresponds to one full row + levelpos = row - first(ccrange) + 1 + levelrange = levelpos:levelpos + else + # multiple entries may constitute one row + idx = first(ccrange) + levelpos = findfirst(repn_levels, 0) # NOTE: can start from cursor.levelpos to optimize, but that will prevent using setrow to go backwards + while idx < row + levelpos = findnext(repn_levels, 0, levelpos+1) + idx += 1 + end + levelend = max(findnext(repn_levels, 0, levelpos+1)-1, length(repn_levels)) + levelrange = levelpos:levelend end - levelend = max(findnext(repn_levels, 0, levelpos+1)-1, length(repn_levels)) - levelrange = levelpos:levelend - end - # compute the val pos for row - if isempty(defn_levels) - # all entries are required, so there must be a corresponding value - valpos = levelpos - #valrange = levelrange - else - maxdefn = cursor.maxdefn - if ccincr - valpos = cursor.valpos + # compute the val pos for row + if isempty(defn_levels) + # all entries are required, so there must be a corresponding value + valpos = levelpos + #valrange = levelrange else - valpos = sum(view(defn_levels, 
1:(levelpos-1)) .== maxdefn) + 1 + maxdefn = cursor.maxdefn + if ccincr + valpos = cursor.valpos + else + valpos = sum(view(defn_levels, 1:(levelpos-1)) .== maxdefn) + 1 + end + #nvals = sum(sub(defn_levels, levelrange) .== maxdefn) + #valrange = valpos:(valpos+nvals-1) end - #nvals = sum(sub(defn_levels, levelrange) .== maxdefn) - #valrange = valpos:(valpos+nvals-1) - end - cursor.levelpos = levelpos - cursor.levelrange = levelrange - cursor.valpos = valpos - #cursor.valrange = valrange + cursor.levelpos = levelpos + cursor.levelrange = levelrange + cursor.valpos = valpos + #cursor.valrange = valrange + end nothing end @@ -209,7 +215,7 @@ function _start(cursor::ColCursor) end function _done(cursor::ColCursor, rowandlevel::Tuple{Int64,Int64}) row, levelpos = rowandlevel - (levelpos > last(cursor.levelrange)) && _done(cursor.row, row) + (levelpos > last(cursor.levelrange)) || _done(cursor.row, row) end function _next(cursor::ColCursor{T}, rowandlevel::Tuple{Int64,Int64}) where {T} # find values for current row and level in row @@ -253,28 +259,27 @@ mutable struct RecordCursor{T} colnames::Vector{Vector{String}} colcursors::Vector{ColCursor} colstates::Vector{Tuple{Int64,Int64}} + rows::UnitRange{Int64} # rows to scan over + row::Int64 # current row end function RecordCursor(par::ParFile; rows::UnitRange=1:nrows(par), colnames::Vector{Vector{String}}=colnames(par), row::Signed=first(rows)) colcursors = [ColCursor(par, UnitRange{Int64}(rows), colname, Int64(row)) for colname in colnames] sch = schema(par) rectype = ntelemtype(sch, sch.schema[1]) - RecordCursor{rectype}(par, colnames, colcursors, Array{Tuple{Int64,Int64}}(undef, length(colcursors))) + RecordCursor{rectype}(par, colnames, colcursors, Array{Tuple{Int64,Int64}}(undef, length(colcursors)), rows, row) end eltype(cursor::RecordCursor{T}) where {T} = T -length(cursor::RecordCursor) = length(first(cursor.colcursors).row.rows) +length(cursor::RecordCursor) = length(cursor.rows) -function state(cursor::RecordCursor) - col1_row, _col1_level = first(cursor.colstates) - col1_row # return row as state, picked up from the state of first column -end +state(cursor::RecordCursor) = cursor.row function _start(cursor::RecordCursor) cursor.colstates = [_start(colcursor) for colcursor in cursor.colcursors] state(cursor) end -_done(cursor::RecordCursor, row::Int64) = _done(cursor.colcursors[1].row, row) +_done(cursor::RecordCursor, row::Int64) = (row > last(cursor.rows)) function _next(cursor::RecordCursor{T}, _row::Int64) where {T} states = cursor.colstates @@ -284,11 +289,14 @@ function _next(cursor::RecordCursor{T}, _row::Int64) where {T} col_repeat_state = Dict{AbstractString,Int}() for colid in 1:length(states) # for each column colcursor = cursors[colid] - colval, colstate = _next(colcursor, states[colid]) # for each value, defn level, repn level in column - val, def, rep = colval - update_record(cursor.par, row, colcursor.colname, val, def, rep, col_repeat_state) # update record - states[colid] = colstate # set last state to states + if !_done(colcursor, states[colid]) + colval, colstate = _next(colcursor, states[colid]) # for each value, defn level, repn level in column + val, def, rep = colval + update_record(cursor.par, row, colcursor.colname, val, def, rep, col_repeat_state) # update record + states[colid] = colstate # set last state to states + end end + cursor.row += 1 _nt(row, T), state(cursor) end diff --git a/src/reader.jl b/src/reader.jl index 6cd5f74..7e27869 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -130,13 +130,13 @@ 
function rowgroups(par::ParFile, cnames::Vector{Vector{String}}, rowrange::UnitR cnamesrg = colnames(rowgrp) found = length(intersect(cnames, cnamesrg)) endrow = beginrow + rowgrp.num_rows - 1 - (found == L) && (length(intersect(beginrow:endrow)) > 0) && push!(R, rowgrp) + (found == L) && (length(beginrow:endrow) > 0) && push!(R, rowgrp) beginrow = endrow + 1 end R end -columns(par::ParFile, rowgroupidx::Integer) = columns(par, rowgroups(par)[rowgroupidx]) +columns(par::ParFile, rowgroupidx) = columns(par, rowgroups(par)[rowgroupidx]) columns(par::ParFile, rowgroup::RowGroup) = rowgroup.columns columns(par::ParFile, rowgroup::RowGroup, colname::Vector{String}) = columns(par, rowgroup, [colname]) function columns(par::ParFile, rowgroup::RowGroup, cnames::Vector{Vector{String}}) @@ -166,8 +166,8 @@ function _pagevec(par::ParFile, col::ColumnChunk) end pagevec end -pages(par::ParFile, rowgroupidx::Integer, colidx::Integer) = pages(par, columns(par, rowgroupidx), colidx) -pages(par::ParFile, cols::Vector{ColumnChunk}, colidx::Integer) = pages(par, cols[colidx]) +pages(par::ParFile, rowgroupidx, colidx) = pages(par, columns(par, rowgroupidx), colidx) +pages(par::ParFile, cols::Vector{ColumnChunk}, colidx) = pages(par, cols[colidx]) pages(par::ParFile, col::ColumnChunk) = cacheget(par.page_cache, col, col->_pagevec(par,col)) function bytes(page::Page, uncompressed::Bool=true) @@ -195,8 +195,8 @@ end map_dict_vals(valdict::Vector{T1}, vals::Vector{T2}) where {T1, T2} = isempty(valdict) ? vals : [valdict[v+1] for v in vals] -values(par::ParFile, rowgroupidx::Integer, colidx::Integer) = values(par, columns(par, rowgroupidx), colidx) -values(par::ParFile, cols::Vector{ColumnChunk}, colidx::Integer) = values(par, cols[colidx]) +values(par::ParFile, rowgroupidx, colidx) = values(par, columns(par, rowgroupidx), colidx) +values(par::ParFile, cols::Vector{ColumnChunk}, colidx) = values(par, cols[colidx]) function values(par::ParFile, col::ColumnChunk) ctype = coltype(col) pgs = pages(par, col) @@ -232,8 +232,8 @@ function values(par::ParFile, col::ColumnChunk) vals, defn_levels, repn_levels end -function read_levels(io::IO, max_val::Integer, enc::Int32, num_values::Integer) - bw = bitwidth(max_val) +function read_levels(io::IO, max_val::Int, enc::Int32, num_values::Int32) + bw = UInt8(bitwidth(max_val)) (bw == 0) && (return Int[]) @debug("reading levels. enc:$enc ($(Thrift.enumstr(Encoding,enc))), max_val:$max_val, num_values:$num_values") @@ -249,7 +249,7 @@ function read_levels(io::IO, max_val::Integer, enc::Int32, num_values::Integer) end end -function read_values(io::IO, enc::Int32, typ::Int32, num_values::Integer) +function read_values(io::IO, enc::Int32, typ::Int32, num_values::Int32) @debug("reading values. enc:$enc ($(Thrift.enumstr(Encoding,enc))), num_values:$num_values") if enc == Encoding.PLAIN @@ -280,7 +280,7 @@ function values(par::ParFile, page::Page) end end -function read_levels_and_values(io::IO, encs::Tuple, ctype::Int32, num_values::Integer, par::ParFile, page::Page) +function read_levels_and_values(io::IO, encs::Tuple, ctype::Int32, num_values::Int32, par::ParFile, page::Page) cname = colname(page.colchunk) enc, defn_enc, rep_enc = encs @@ -298,7 +298,7 @@ function read_levels_and_values(io::IO, encs::Tuple, ctype::Int32, num_values::I # where defn_levels's elements == 1 are present and only # sum(defn_levels) values can be read. 
# because defn_levels == 0 are where the missing vlaues are - nmissing = sum(==(0), defn_levels) + nmissing = Int32(sum(==(0), defn_levels)) vals = read_values(io, enc, ctype, num_values - nmissing) vals, defn_levels, repn_levels diff --git a/test/test_codec.jl b/test/test_codec.jl index 329d01a..cadc3fc 100644 --- a/test/test_codec.jl +++ b/test/test_codec.jl @@ -5,7 +5,7 @@ function test_codec() println("testing reading bitpacked run (old scheme)...") let data = UInt8[0x05, 0x39, 0x77] io = PipeBuffer(data) - decoded = Parquet.read_bitpacked_run_old(io, 8, 3) + decoded = Parquet.read_bitpacked_run_old(io, 8, UInt8(3)) @test decoded == Int32[0:7;] end println("passed.") @@ -13,7 +13,11 @@ function test_codec() println("testing reading bitpacked run...") let data = UInt8[0x88, 0xc6, 0xfa] io = PipeBuffer(data) - decoded = Parquet.read_bitpacked_run(io, 1, 3) + bits = UInt8(3) + byt = Parquet.bit2bytewidth(bits) + itype = Parquet.byt2itype(byt) + arr = Array{itype}(undef, 8) + decoded = Parquet.read_bitpacked_run(io, 1, bits, byt, itype, arr) @test decoded == Int32[0:7;] end println("passed.") From 0a822aeb5582618bec4d9795537115b68ec34404 Mon Sep 17 00:00:00 2001 From: tan Date: Tue, 19 May 2020 12:46:17 +0530 Subject: [PATCH 30/52] more performance fixes --- src/cursor.jl | 48 ++++++++++++++++++++++++++++-------------------- src/reader.jl | 8 ++++++++ src/schema.jl | 22 +++++++++++----------- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/src/cursor.jl b/src/cursor.jl index 1c7bf72..71e46c2 100644 --- a/src/cursor.jl +++ b/src/cursor.jl @@ -227,10 +227,10 @@ function _next(cursor::ColCursor{T}, rowandlevel::Tuple{Int64,Int64}) where {T} repn_level = isempty(cursor.repn_levels) ? 0 : cursor.repn_levels[levelpos] cursor.levelpos += 1 if defn_level == maxdefn - val = (cursor.vals[cursor.valpos])::Union{Nothing,T} + val = (cursor.vals[cursor.valpos])::T cursor.valpos += 1 else - val = (nothing)::Union{Nothing,T} + val = nothing end # advance row @@ -239,10 +239,10 @@ function _next(cursor::ColCursor{T}, rowandlevel::Tuple{Int64,Int64}) where {T} setrow(cursor, row) end - (val, defn_level, repn_level), (row, cursor.levelpos) + NamedTuple{(:value, :defn_level, :repn_level),Tuple{Union{Nothing,T},Int64,Int64}}((val, defn_level, repn_level)), (row, cursor.levelpos) end -function Base.iterate(cursor::ColCursor, state) +function Base.iterate(cursor::ColCursor{T}, state) where {T} _done(cursor, state) && return nothing return _next(cursor, state) end @@ -286,15 +286,11 @@ function _next(cursor::RecordCursor{T}, _row::Int64) where {T} cursors = cursor.colcursors row = Dict{Symbol,Any}() - col_repeat_state = Dict{AbstractString,Int}() + col_repeat_state = Dict{Tuple{Int,Int},Int}() for colid in 1:length(states) # for each column colcursor = cursors[colid] - if !_done(colcursor, states[colid]) - colval, colstate = _next(colcursor, states[colid]) # for each value, defn level, repn level in column - val, def, rep = colval - update_record(cursor.par, row, colcursor.colname, val, def, rep, col_repeat_state) # update record - states[colid] = colstate # set last state to states - end + colstate = states[colid] + states[colid] = update_record(cursor.par, row, colid, colcursor, colstate, col_repeat_state) end cursor.row += 1 _nt(row, T), state(cursor) @@ -310,20 +306,32 @@ function Base.iterate(cursor::RecordCursor{T}) where {T} return r end -function _nt(dict::Dict{Symbol,Any}, ::Type{T}) where {T} - _val_or_missing = (idx,k) -> begin - v = get(dict, k, missing) - isa(v, Dict{Symbol,Any}) ? 
_nt(v, T.types[idx]) : v +function _val_or_missing(dict::Dict{Symbol,Any}, k::Symbol, ::Type{T}) where {T} + v = get(dict, k, missing) + (isa(v, Dict{Symbol,Any}) ? _nt(v, T) : v)::T +end + +@generated function _nt(dict::Dict{Symbol,Any}, ::Type{T}) where {T} + names = fieldnames(T) + strnames = ["$n" for n in names] + quote + return T(($([:(_val_or_missing(dict,Symbol($(strnames[i])),$(fieldtype(T,i)))) for i in 1:length(names)]...),)) end - values = [_val_or_missing(idx,k) for (idx,k) in enumerate(T.names)] - T((values...,)) end default_init(::Type{Vector{T}}) where {T} = Vector{T}() default_init(::Type{Dict{Symbol,Any}}) = Dict{Symbol,Any}() default_init(::Type{T}) where {T} = ccall(:jl_new_struct_uninit, Any, (Any,), T)::T -function update_record(par::ParFile, row::Dict{Symbol,Any}, nameparts::Vector{String}, val, defn_level::Signed, repn_level::Signed, col_repeat_state::Dict{AbstractString,Int}) +function update_record(par::ParFile, row::Dict{Symbol,Any}, colid::Int, colcursor::ColCursor{T}, colcursor_state::Tuple{Int64,Int64}, col_repeat_state::Dict{Tuple{Int,Int},Int}) where {T} + if !_done(colcursor, colcursor_state) + colval, colcursor_state = _next(colcursor, colcursor_state) # for each value, defn level, repn level in column + update_record(par, row, colid, colcursor.colname, colval.value, colval.defn_level, colval.repn_level, col_repeat_state) # update record + end + colcursor_state # return new colcursor state +end + +function update_record(par::ParFile, row::Dict{Symbol,Any}, colid::Int, nameparts::Vector{String}, val, defn_level::Int64, repn_level::Int64, col_repeat_state::Dict{Tuple{Int,Int},Int}) lparts = length(nameparts) sch = par.schema F = row # the current field corresponding to the level in nameparts @@ -332,7 +340,7 @@ function update_record(par::ParFile, row::Dict{Symbol,Any}, nameparts::Vector{St # for each name part of colname (a field) for idx in 1:lparts - colname = nameparts[1:idx] + colname = view(nameparts, 1:idx) #@debug("updating part $colname of $nameparts isnull:$(val === nothing), def:$(defn_level), rep:$(repn_level)") leaf = nameparts[idx] symleaf = Symbol(leaf) @@ -345,7 +353,7 @@ function update_record(par::ParFile, row::Dict{Symbol,Any}, nameparts::Vector{St defined = ((val === nothing) || (idx < lparts)) ? haskey(F, symleaf) : false mustdefine = defn_level >= Fdefn mustrepeat = repeated && (repn_level == Frepn) - repkey = join(nameparts, '.') * ":" * join(colname, '.') + repkey = (colid, idx) #join(nameparts, '.') * ":" * string(idx) #join(colname, '.') repidx = get(col_repeat_state, repkey, 0) if mustrepeat repidx += 1 diff --git a/src/reader.jl b/src/reader.jl index 7e27869..46449b0 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -218,6 +218,14 @@ function values(par::ParFile, col::ColumnChunk) enc, defn_enc, rep_enc = page_encodings(pg) if enc == Encoding.PLAIN_DICTIONARY || enc == Encoding.RLE_DICTIONARY append!(vals, map_dict_vals(valdict, _vals)) + #= + if isempty(valdict) + append!(vals, _vals) + else + mapped_vals = [valdict[v+1] for v in _vals] + append!(vals, mapped_vals) + end + =# else append!(vals, _vals) end diff --git a/src/schema.jl b/src/schema.jl index fe360c8..7776a92 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -41,23 +41,23 @@ mutable struct Schema end end -leafname(schname::Vector{String}) = [schname[end]] +leafname(schname::T) where {T <: AbstractVector{String}} = [schname[end]] -parentname(schname::Vector{String}) = istoplevel(schname) ? 
schname : schname[1:(end-1)] +parentname(schname::T) where {T <: AbstractVector{String}} = istoplevel(schname) ? schname : schname[1:(end-1)] istoplevel(schname::Vector) = !(length(schname) > 1) -elem(sch::Schema, schname::Vector{String}) = sch.name_lookup[schname] +elem(sch::Schema, schname::T) where {T <: AbstractVector{String}} = sch.name_lookup[schname] isrepetitiontype(schelem::SchemaElement, repetition_type) = Thrift.isfilled(schelem, :repetition_type) && (schelem.repetition_type == repetition_type) -isrequired(sch::Schema, schname::Vector{String}) = isrequired(elem(sch, schname)) +isrequired(sch::Schema, schname::T) where {T <: AbstractVector{String}} = isrequired(elem(sch, schname)) isrequired(schelem::SchemaElement) = isrepetitiontype(schelem, FieldRepetitionType.REQUIRED) -isoptional(sch::Schema, schname::Vector{String}) = isoptional(elem(sch, schname)) +isoptional(sch::Schema, schname::T) where {T <: AbstractVector{String}} = isoptional(elem(sch, schname)) isoptional(schelem::SchemaElement) = isrepetitiontype(schelem, FieldRepetitionType.OPTIONAL) -isrepeated(sch::Schema, schname::Vector{String}) = isrepeated(elem(sch, schname)) +isrepeated(sch::Schema, schname::T) where {T <: AbstractVector{String}} = isrepeated(elem(sch, schname)) isrepeated(schelem::SchemaElement) = isrepetitiontype(schelem, FieldRepetitionType.REPEATED) function path_in_schema(sch::Schema, schelem::SchemaElement) @@ -67,7 +67,7 @@ function path_in_schema(sch::Schema, schelem::SchemaElement) error("schema element not found in schema") end -function logical_convert(sch::Schema, schname::Vector{String}, val) +function logical_convert(sch::Schema, schname::T, val) where {T <: AbstractVector{String}} elem = sch.name_lookup[schname] if schname in keys(sch.map_logical_types) @@ -81,7 +81,7 @@ function logical_convert(sch::Schema, schname::Vector{String}, val) end end -elemtype(sch::Schema, schname::Vector{String}) = get!(sch.type_lookup, schname) do +elemtype(sch::Schema, schname::T) where {T <: AbstractVector{String}} = get!(sch.type_lookup, schname) do elem = sch.name_lookup[schname] if schname in keys(sch.map_logical_types) @@ -111,7 +111,7 @@ function elemtype(schelem::SchemaElement) jtype end -ntelemtype(sch::Schema, schname::Vector{String}) = get!(sch.nttype_lookup, schname) do +ntelemtype(sch::Schema, schname::T) where {T <: AbstractVector{String}} = get!(sch.nttype_lookup, schname) do ntelemtype(sch, sch.name_lookup[schname]) end function ntelemtype(sch::Schema, schelem::SchemaElement) @@ -130,12 +130,12 @@ bit_or_byte_length(schelem::SchemaElement) = Thrift.isfilled(schelem, :type_leng num_children(schelem::SchemaElement) = Thrift.isfilled(schelem, :num_children) ? schelem.num_children : 0 -function max_repetition_level(sch::Schema, schname::Vector{String}) +function max_repetition_level(sch::Schema, schname::T) where {T <: AbstractVector{String}} lev = isrepeated(sch, schname) ? 1 : 0 istoplevel(schname) ? lev : (lev + max_repetition_level(sch, parentname(schname))) end -function max_definition_level(sch::Schema, schname::Vector{String}) +function max_definition_level(sch::Schema, schname::T) where {T <: AbstractVector{String}} lev = isrequired(sch, schname) ? 0 : 1 istoplevel(schname) ? 
lev : (lev + max_definition_level(sch, parentname(schname))) end From d4f8a94cc1d5ced264074c74f21e071b0282cae1 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Tue, 19 May 2020 21:18:37 +1000 Subject: [PATCH 31/52] minor --- .gitignore | 6 ++++ Project.toml | 2 ++ src/Parquet.jl | 5 +++ src/column_reader.jl | 79 ++++++++++++++++++++++++++++++++++++++++++++ src/encoding.jl | 14 ++++++++ src/metadata.jl | 13 ++++++++ src/read_parquet.jl | 56 +++++++++++++++++++++++++++++++ 7 files changed, 175 insertions(+) create mode 100644 src/column_reader.jl create mode 100644 src/encoding.jl create mode 100644 src/metadata.jl create mode 100644 src/read_parquet.jl diff --git a/.gitignore b/.gitignore index c4f35ef..0d9aedb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,9 @@ parquet-compatibility/ julia-parquet-compatibility/ .vscode/settings.json +Manifest.toml +parquet.code-workspace +src/column_reader_rewrite.jl +src/column_reader_to_vals.jl +src/column_reader-dev.jl +src/read_parquet-test.jl diff --git a/Project.toml b/Project.toml index 2c22e9d..070aa30 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,8 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +NamedTupleTools = "d9ec5142-1e00-5aa0-9d6a-321866360f50" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22" diff --git a/src/Parquet.jl b/src/Parquet.jl index 313f8dd..1e84987 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -22,6 +22,7 @@ export logical_timestamp, logical_string export RecordCursor export write_parquet +export read_parquet # package code goes here include("PAR2/PAR2.jl") @@ -32,5 +33,9 @@ include("reader.jl") include("cursor.jl") include("show.jl") include("writer.jl") +include("encoding.jl") +include("metadata.jl") +include("column_reader.jl") +include("read_parquet.jl") end # module diff --git a/src/column_reader.jl b/src/column_reader.jl new file mode 100644 index 0000000..d91d20a --- /dev/null +++ b/src/column_reader.jl @@ -0,0 +1,79 @@ + +const TYPES = (Bool, Int32, Int64, Int128, Float32, Float64, String, UInt8) + +read_column(path, col_num) = read_column(path, metadata(path), col_num) + +function read_column(path, filemetadata, col_num) + par = ParFile(path) + + T = TYPES[filemetadata.schema[col_num+1]._type+1] + # TODO detect if missing is necessary + res = Vector{Union{Missing, T}}(missing, nrows(par)) + write_cursor = 1 + for row_group in filemetadata.row_groups + pgs = pages(par, row_group.columns[col_num]) + + drop_page_count = 0 + # is the first page a dictionary page + # this is not the case for boolean values for example + if isfilled(pgs[1].hdr, :dictionary_page_header) + # the first page is almost always the dictionary page + dictionary_page = pgs[1] + drop_page_count = 1 + dictionary_of_values = T.(values(par, dictionary_page)[1]) + end + + # TODO deal with other types of pages e.g. 
dataheaderv2 + + # everything after the first data datapages + for data_page in Base.Iterators.drop(pgs, drop_page_count) + vals, definitions, decode = values(par, data_page) + + @assert all(in((0, 1)), definitions) + + l = sum(==(1), definitions) + # if all definitions values are 1 then it's not used + definitions_not_used = all(==(1), definitions) + + # data_page can be either + # * dictionary-encoded in which case we should look into the dictionary + # * plained-encoded in which case just return the values + page_encoding = Parquet.page_encoding(data_page) + + if page_encoding == Encoding.PLAIN_DICTIONARY + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = dictionary_of_values[value + 1] + val_index += 1 + end + end + end + elseif page_encoding == Encoding.PLAIN + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= T.(vals) + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = T(value) + val_index += 1 + end + end + end + else + error("page encoding not supported yet") + end + + write_cursor += length(definitions) + end + end + return res +end + +() diff --git a/src/encoding.jl b/src/encoding.jl new file mode 100644 index 0000000..8efd5b6 --- /dev/null +++ b/src/encoding.jl @@ -0,0 +1,14 @@ +# obtain the encoding of the page +using Thrift: isfilled + +function page_encoding(page::Page) + if isfilled(page.hdr, :data_page_header) + return page.hdr.data_page_header.encoding + elseif isfilled(page.hdr, :data_page_header_v2) + return page.hdr.data_page_header_v2.encoding + elseif isfilled(page.hdr, :dictionary_page_header) + return page.hdr.dictionary_page_header.encoding + else + error("not supported page") + end +end diff --git a/src/metadata.jl b/src/metadata.jl new file mode 100644 index 0000000..1c8c5af --- /dev/null +++ b/src/metadata.jl @@ -0,0 +1,13 @@ +using Thrift + +function metadata(path) + io = open(path) + sz = filesize(io) + seek(io, sz - SZ_PAR_MAGIC - SZ_FOOTER) + + # read footer size as little endian signed Int32 + meta_len = read(io, Int32) + datasize = sz - meta_len - 2SZ_PAR_MAGIC - SZ_FOOTER + seek(io, SZ_PAR_MAGIC + datasize) + filemetadata = read_thrift(io, PAR2.FileMetaData) +end diff --git a/src/read_parquet.jl b/src/read_parquet.jl new file mode 100644 index 0000000..4916a6c --- /dev/null +++ b/src/read_parquet.jl @@ -0,0 +1,56 @@ +using Base.Threads: @spawn +using Base.Iterators: drop +using ProgressMeter: @showprogress +using NamedTupleTools: namedtuple + +read_parquet(path, cols::Vector{Symbol}; kwargs...) = read_parquet(path, String.(cols); kwargs...) + +read_parquet(path; kwargs...) = read_parquet(path, String[]; kwargs...) 
+ +function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = false) + """function for reading parquet""" + + if multithreaded + # use a bounded channel to limit + c1 = Channel{Bool}(Threads.nthreads()) + atexit(()->close(c1)) + end + + nc = ncols(ParFile(path)) + + colnames = [sch.name for sch in drop(ParFile(path).schema.schema, 1)] + + if length(cols) == 0 + colnums = collect(1:nc) + else + colnums = [findfirst(==(c), colnames) for c in cols] + end + + results = Vector{Any}(undef, length(colnums)) + + filemetadata = metadata(path) + + if multithreaded + @showprogress for (i, j) in enumerate(colnums) + put!(c1, true) + results[i] = @spawn begin + res = read_column(path, filemetadata, j) + take!(c1) + res + end + end + else + @showprogress for (i, j) in enumerate(colnums) + results[i] = read_column(path, filemetadata, j) + end + end + + symbol_col_names = collect(Symbol(col) for col in colnames[colnums]) + + if multithreaded + fnl_results = collect(fetch(result) for result in results) + return namedtuple(symbol_col_names, fnl_results) + else + return namedtuple(symbol_col_names, results) + end +end From 8432d5c6df693404f9ee6e605b7c992f4bc017c6 Mon Sep 17 00:00:00 2001 From: evalparse Date: Tue, 19 May 2020 21:21:59 +1000 Subject: [PATCH 32/52] Update README.md --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 9a42604..8012ec4 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,16 @@ ## Reader +### High level reader + +You can read a parquet file using `read_parquet` for example + +``` +df = read_parquet(parquet_file_path); +``` + +### Lower level reader + Load a [parquet file](https://en.wikipedia.org/wiki/Apache_Parquet). Only metadata is read initially, data is loaded in chunks on demand. (Note: [ParquetFiles.jl](https://github.com/queryverse/ParquetFiles.jl) also provides load support for Parquet files under the FileIO.jl package.) `ParFile` represents a Parquet file at `path` open for reading. Options to map logical types can be provided via `map_logical_types`. 
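For example, a minimal sketch of the lower-level reader described above (the file name `data.parquet` and the column name `name` are placeholders, and the `(String, Parquet.logical_string)` mapping value is an assumed convention for `map_logical_types`, not something shown in this patch):

```julia
using Parquet

# Open the file; only the footer metadata is read at this point.
# The map_logical_types value format below is an assumption for illustration.
par = ParFile("data.parquet";
              map_logical_types = Dict(["name"] => (String, Parquet.logical_string)))

nrows(par), ncols(par)   # row and column counts taken from the file metadata

# Iterate whole records; each `rec` is a NamedTuple keyed by column name.
for rec in RecordCursor(par)
    # use rec.name, etc.
end

close(par)               # releases the underlying file handle
```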
From fb2b3c2f57ffc2a40ba4efda0f7704fcd35c92a8 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Thu, 21 May 2020 22:52:28 +1000 Subject: [PATCH 33/52] tries to accomodate master --- test/test_writer.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index bfb93b7..2b5ae7e 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -33,10 +33,12 @@ function test_write() # the file is very small so only one rowgroup col_chunks = columns(pf, 1) - for colnum in 1:length(col_chunks) + + for (colnum, col_chunk) in enumerate(col_chunks) + println(colnum) correct_vals = tbl[colnum] coltype = eltype(correct_vals) - vals_from_file = values(pf, col_chunks, colnum) + vals_from_file = values(pf, col_chunk) if Missing <: coltype @test ismissing.(correct_vals) == (vals_from_file[2] .== 0) end @@ -44,7 +46,9 @@ function test_write() if nonmissingtype(coltype) == String @test all(skipmissing(correct_vals) .== String.(vals_from_file[1])) else - @test all(skipmissing(correct_vals) .== vals_from_file[1]) + non_missing_vals = collect(skipmissing(correct_vals)) + non_missing_vals_read = vals_from_file[1][1:sum(vals_from_file[2])] + @test all(non_missing_vals .== non_missing_vals_read) end end @@ -52,4 +56,4 @@ function test_write() close(pf) end -test_write() +# test_write() From 6b7bd64bb001498f477eb890ca2bd2319e9476b1 Mon Sep 17 00:00:00 2001 From: evalparse Date: Fri, 22 May 2020 09:49:17 +1000 Subject: [PATCH 34/52] Update test/test_writer.jl --- test/test_writer.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 2b5ae7e..dddafc1 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -35,7 +35,6 @@ function test_write() for (colnum, col_chunk) in enumerate(col_chunks) - println(colnum) correct_vals = tbl[colnum] coltype = eltype(correct_vals) vals_from_file = values(pf, col_chunk) From 275e7f20d41f4d1fc17bb87881b1ad24e2d80194 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Fri, 22 May 2020 13:29:30 +1000 Subject: [PATCH 35/52] added little endian writes --- src/writer.jl | 20 ++++++++++---------- test/test_writer.jl | 3 ++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index afaadac..260c976 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -70,7 +70,7 @@ function compress_using_codec(colvals::AbstractVector{String}, codec::Int)::Vect for val in colvals # for string it needs to be stored as BYTE_ARRAY which needs the length # to be the first 4 bytes UInt32 - write(io, val |> sizeof |> UInt32) + write(io, val |> sizeof |> UInt32 |> htol) # write each of the strings one after another write(io, val) end @@ -97,7 +97,7 @@ function write_defn_levels(data_to_compress_io, colvals::AbstractVector{Union{Mi encoded_defn_data_length = length(bitpacking_header) + bytes_needed # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, UInt32(encoded_defn_data_length) |> htol) write(data_to_compress_io, bitpacking_header) write(data_to_compress_io, encoded_defn_data) end @@ -112,7 +112,7 @@ function write_defn_levels(data_to_compress_io, colvals::AbstractVector) encoded_defn_data_length = sizeof(rle_header) + sizeof(repeated_value) # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, UInt32(encoded_defn_data_length) |> htol) write(data_to_compress_io, rle_header) write(data_to_compress_io, repeated_value) end @@ -178,7 
+178,7 @@ function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{S for val in colvals # for string it needs to be stored as BYTE_ARRAY which needs the length # to be the first 4 bytes UInt32 - write(data_to_compress_io, val |> sizeof |> UInt32) + write(data_to_compress_io, val |> sizeof |> UInt32 |> htol) # write each of the strings one after another write(data_to_compress_io, val) end @@ -201,13 +201,13 @@ end function write_encoded_data(data_to_compress_io, colvals::AbstractArray) """ Efficient write of encoded data for `isbits` types""" @assert isbitstype(eltype(colvals)) - write(data_to_compress_io, colvals) + write(data_to_compress_io, colvals |> htol) end function write_encoded_data(data_to_compress_io, colvals::SkipMissing) """ Write of encoded data for skipped missing types""" for val in colvals - write(data_to_compress_io, val) + write(data_to_compress_io, val |> htol) end end @@ -216,7 +216,7 @@ function write_encoded_data(data_to_compress_io, colvals) The only requirement is that colvals has to be iterable """ for val in skipmissing(colvals) - write(data_to_compress_io, val) + write(data_to_compress_io, val |> htol) end end @@ -288,10 +288,10 @@ function write_col_page(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encodi rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) repeated_value = UInt8(1) - encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) + encoded_defn_data_length = sizeof(rle_header) + sizeof(repeated_value) ## write the encoded data length - write(fileio, encoded_defn_data_length) + write(fileio, encoded_defn_data_length |> UInt32 |> htol) write(fileio, rle_header) write(fileio, repeated_value) @@ -593,7 +593,7 @@ function _write_parquet(itr_vectors, colnames, path, nchunks; ncols = length(itr filemetadata_size = write_thrift(fileio, filemetadata) - write(fileio, UInt32(filemetadata_size)) + write(fileio, UInt32(filemetadata_size) |> htol) write(fileio, "PAR1") close(fileio) end diff --git a/test/test_writer.jl b/test/test_writer.jl index dddafc1..5bb9ca8 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -35,6 +35,7 @@ function test_write() for (colnum, col_chunk) in enumerate(col_chunks) + println(colnum) correct_vals = tbl[colnum] coltype = eltype(correct_vals) vals_from_file = values(pf, col_chunk) @@ -55,4 +56,4 @@ function test_write() close(pf) end -# test_write() +test_write() From dda544cfeb9bff1347402adbfd6ea73fa70f6db4 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 01:33:18 +1000 Subject: [PATCH 36/52] minor --- src/column_reader.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index d91d20a..7a07a08 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -75,5 +75,3 @@ function read_column(path, filemetadata, col_num) end return res end - -() From 1930cc776a29d718237062c6902dd0f8a15fc8f5 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 01:52:16 +1000 Subject: [PATCH 37/52] fixed test --- test/test_writer.jl | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 5bb9ca8..795252d 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -33,9 +33,18 @@ function test_write() # the file is very small so only one rowgroup col_chunks = columns(pf, 1) + colnum=12 + col_chunk=col_chunks[colnum] + + correct_vals = tbl[colnum] + coltype = eltype(correct_vals) + vals_from_file = values(pf, col_chunk) + + if 
Missing <: coltype + @test ismissing.(correct_vals) == (vals_from_file[2] .== 0) + end for (colnum, col_chunk) in enumerate(col_chunks) - println(colnum) correct_vals = tbl[colnum] coltype = eltype(correct_vals) vals_from_file = values(pf, col_chunk) @@ -43,10 +52,12 @@ function test_write() @test ismissing.(correct_vals) == (vals_from_file[2] .== 0) end + non_missing_vals = collect(skipmissing(correct_vals)) + if nonmissingtype(coltype) == String - @test all(skipmissing(correct_vals) .== String.(vals_from_file[1])) + non_missing_vals_read = String.(vals_from_file[1][1:sum(vals_from_file[2])]) + @test all(non_missing_vals .== non_missing_vals_read) else - non_missing_vals = collect(skipmissing(correct_vals)) non_missing_vals_read = vals_from_file[1][1:sum(vals_from_file[2])] @test all(non_missing_vals .== non_missing_vals_read) end From 58e79202d0db2d3e0f222ae733549ab1cd84f856 Mon Sep 17 00:00:00 2001 From: evalparse Date: Sat, 23 May 2020 02:00:22 +1000 Subject: [PATCH 38/52] Update src/Parquet.jl --- src/Parquet.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parquet.jl b/src/Parquet.jl index 421a0bb..18f0830 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -11,7 +11,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -const PARQUET_JL_VERSION = v"0.5.1" +const PARQUET_JL_VERSION = v"0.5.3" import Base: show, open, close, values, eltype, length import Thrift: isfilled From 21d645fe97aaa28248bfd9c002396cc45a14fd77 Mon Sep 17 00:00:00 2001 From: evalparse Date: Sat, 23 May 2020 02:00:56 +1000 Subject: [PATCH 39/52] Update test/test_writer.jl --- test/test_writer.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 795252d..51718b1 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -33,7 +33,6 @@ function test_write() # the file is very small so only one rowgroup col_chunks = columns(pf, 1) - colnum=12 col_chunk=col_chunks[colnum] correct_vals = tbl[colnum] From 54c5f0ca1f3048ccf6f26a57339fe494bc307b61 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 02:04:00 +1000 Subject: [PATCH 40/52] minor fix --- test/test_writer.jl | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 795252d..3887e80 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -33,17 +33,6 @@ function test_write() # the file is very small so only one rowgroup col_chunks = columns(pf, 1) - colnum=12 - col_chunk=col_chunks[colnum] - - correct_vals = tbl[colnum] - coltype = eltype(correct_vals) - vals_from_file = values(pf, col_chunk) - - if Missing <: coltype - @test ismissing.(correct_vals) == (vals_from_file[2] .== 0) - end - for (colnum, col_chunk) in enumerate(col_chunks) correct_vals = tbl[colnum] coltype = eltype(correct_vals) From 6a94305a40de667cb8979c66da2d593fbac1e15e Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 12:17:52 +1000 Subject: [PATCH 41/52] minor: --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitignore b/.gitignore index 0d9aedb..c4f35ef 100644 --- a/.gitignore +++ b/.gitignore @@ -4,9 +4,3 @@ parquet-compatibility/ julia-parquet-compatibility/ .vscode/settings.json -Manifest.toml -parquet.code-workspace -src/column_reader_rewrite.jl -src/column_reader_to_vals.jl -src/column_reader-dev.jl -src/read_parquet-test.jl From 7046f921f7e73b1ce1e566bad8395c5660e81c35 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 12:26:43 +1000 Subject: [PATCH 42/52] so i dont lose it --- 
src/column_reader_dev.jl | 164 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 src/column_reader_dev.jl diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl new file mode 100644 index 0000000..92e55c5 --- /dev/null +++ b/src/column_reader_dev.jl @@ -0,0 +1,164 @@ + + +using Random: randstring +test_write1() = begin + tbl = ( + int32 = rand(Int32, 1000), + int64 = rand(Int64, 1000), + float32 = rand(Float32, 1000), + float64 = rand(Float64, 1000), + bool = rand(Bool, 1000), + string = [randstring(8) for i in 1:1000], + int32m = rand([missing, rand(Int32, 10)...], 1000), + int64m = rand([missing, rand(Int64, 10)...], 1000), + float32m = rand([missing, rand(Float32, 10)...], 1000), + float64m = rand([missing, rand(Float64, 10)...], 1000), + boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) + ) + + write_parquet("c:/scratch/plsdel.parquet", tbl) +end + +test_write1() + +par = ParFile(path) + +T = TYPES[filemetadata.schema[col_num+1]._type+1] +# TODO detect if missing is necessary +res = Vector{Union{Missing, T}}(missing, nrows(par)) +write_cursor = 1 +for row_group in filemetadata.row_groups + pgs = pages(par, row_group.columns[col_num]) + + drop_page_count = 0 + # is the first page a dictionary page + # this is not the case for boolean values for example + if isfilled(pgs[1].hdr, :dictionary_page_header) + # the first page is almost always the dictionary page + dictionary_page = pgs[1] + drop_page_count = 1 + dictionary_of_values = T.(values(par, dictionary_page)[1]) + end + + # TODO deal with other types of pages e.g. dataheaderv2 + + # everything after the first data datapages + for data_page in Base.Iterators.drop(pgs, drop_page_count) + vals, definitions, decode = values(par, data_page) + + @assert all(in((0, 1)), definitions) + + l = sum(==(1), definitions) + # if all definitions values are 1 then it's not used + definitions_not_used = all(==(1), definitions) + + # data_page can be either + # * dictionary-encoded in which case we should look into the dictionary + # * plained-encoded in which case just return the values + page_encoding = Parquet.page_encoding(data_page) + + if page_encoding == Encoding.PLAIN_DICTIONARY + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = dictionary_of_values[value + 1] + val_index += 1 + end + end + end + elseif page_encoding == Encoding.PLAIN + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= T.(vals) + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = T(value) + val_index += 1 + end + end + end + else + error("page encoding not supported yet") + end + + write_cursor += length(definitions) + end +end +return res +par = ParFile(path) + +T = TYPES[filemetadata.schema[col_num+1]._type+1] +# TODO detect if missing is necessary +res = Vector{Union{Missing, T}}(missing, nrows(par)) +write_cursor = 1 +for row_group in filemetadata.row_groups + pgs = pages(par, row_group.columns[col_num]) + + drop_page_count = 0 + # is the first page a dictionary page + # this is not the case for boolean values for example + if isfilled(pgs[1].hdr, :dictionary_page_header) + # the first page is almost always the dictionary page + dictionary_page = pgs[1] + 
drop_page_count = 1 + dictionary_of_values = T.(values(par, dictionary_page)[1]) + end + + # TODO deal with other types of pages e.g. dataheaderv2 + + # everything after the first data datapages + for data_page in Base.Iterators.drop(pgs, drop_page_count) + vals, definitions, decode = values(par, data_page) + + @assert all(in((0, 1)), definitions) + + l = sum(==(1), definitions) + # if all definitions values are 1 then it's not used + definitions_not_used = all(==(1), definitions) + + # data_page can be either + # * dictionary-encoded in which case we should look into the dictionary + # * plained-encoded in which case just return the values + page_encoding = Parquet.page_encoding(data_page) + + if page_encoding == Encoding.PLAIN_DICTIONARY + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = dictionary_of_values[value + 1] + val_index += 1 + end + end + end + elseif page_encoding == Encoding.PLAIN + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= T.(vals) + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = T(value) + val_index += 1 + end + end + end + else + error("page encoding not supported yet") + end + + write_cursor += length(definitions) + end +end +return res From 2331e995c6fe1ae5c63d5f56ed5bff539f459ef8 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 25 May 2020 00:56:52 +1000 Subject: [PATCH 43/52] got a copy based reader working --- src/column_reader.jl | 273 ++++++++++++++++++++++++++++++--------- src/column_reader_dev.jl | 227 ++++++++++++-------------------- 2 files changed, 293 insertions(+), 207 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 7a07a08..1589872 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -1,77 +1,230 @@ +import Base: iterate, length, IteratorSize, IteratorEltype, eltype const TYPES = (Bool, Int32, Int64, Int128, Float32, Float64, String, UInt8) -read_column(path, col_num) = read_column(path, metadata(path), col_num) +struct BitPackedIterator + data::Vector{UInt8} + bitwidth::Int32 +end + + +iterate(bp::BitPackedIterator) = iterate(bp::BitPackedIterator, 1) + +length(bp::BitPackedIterator) = div(8*length(bp.data), bp.bitwidth) + +IteratorSize(::Type{BitPackedIterator}) = Base.HasLength() +IteratorEltype(::Type{BitPackedIterator}) = Base.HasEltype() +eltype(::Type{BitPackedIterator}) = UInt + +function iterate(bp::BitPackedIterator, state) + end_bit = state * bp.bitwidth + end_byte = ceil(Int, end_bit / 8) + + if end_byte > length(bp.data) + return nothing + end -function read_column(path, filemetadata, col_num) + start_bit = (state - 1) * bp.bitwidth + 1 + + start_byte, bits_to_drop = divrem(start_bit-1, 8) + + start_byte += 1 + bits_to_drop = bits_to_drop + + # start bit shift the value + value = UInt(0) + + @inbounds for byte in @view bp.data[end_byte:-1:start_byte] + value = (value << 8) | byte + end + + value >>= bits_to_drop + + (value & UInt(2^bp.bitwidth-1), state + 1) +end + +function decompress_with_codec(compressed_data::Vector{UInt8}, codec)::Vector{UInt8} + if codec == PAR2.CompressionCodec.SNAPPY + uncompressed_data = Snappy.uncompress(compressed_data) + else + error("codedc $codec unsupported atm") + end +end + +zero_or_missing(::Type{String}) = missing +zero_or_missing(::Type{T}) where 
T = zero(T) + +function read_column(path, col_num) + filemetadata = Parquet.metadata(path) par = ParFile(path) + fileio = open(path) T = TYPES[filemetadata.schema[col_num+1]._type+1] + # TODO detect if missing is necessary - res = Vector{Union{Missing, T}}(missing, nrows(par)) - write_cursor = 1 + res = Vector{Union{Missing, T}}(undef, nrows(par)) + res .= zero_or_missing(T) + + length(filemetadata.row_groups) + + from = 1 + last_from = from for row_group in filemetadata.row_groups - pgs = pages(par, row_group.columns[col_num]) - - drop_page_count = 0 - # is the first page a dictionary page - # this is not the case for boolean values for example - if isfilled(pgs[1].hdr, :dictionary_page_header) - # the first page is almost always the dictionary page - dictionary_page = pgs[1] - drop_page_count = 1 - dictionary_of_values = T.(values(par, dictionary_page)[1]) + colchunk_meta = row_group.columns[col_num].meta_data + + if isfilled(colchunk_meta, :dictionary_page_offset) + seek(fileio, colchunk_meta.dictionary_page_offset) + dict_page_header = read_thrift(fileio, PAR2.PageHeader) + compressed_data = read(fileio, dict_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) + @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size + + if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY + # see https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 + # which is in effect the plain encoding see https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 + dict = reinterpret(T, uncompressed_data) + else + error("Only Plain Dictionary encoding is supported") + end + else + dict = nothing end - # TODO deal with other types of pages e.g. 
dataheaderv2 - - # everything after the first data datapages - for data_page in Base.Iterators.drop(pgs, drop_page_count) - vals, definitions, decode = values(par, data_page) - - @assert all(in((0, 1)), definitions) - - l = sum(==(1), definitions) - # if all definitions values are 1 then it's not used - definitions_not_used = all(==(1), definitions) - - # data_page can be either - # * dictionary-encoded in which case we should look into the dictionary - # * plained-encoded in which case just return the values - page_encoding = Parquet.page_encoding(data_page) - - if page_encoding == Encoding.PLAIN_DICTIONARY - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = dictionary_of_values[value + 1] - val_index += 1 - end - end - end - elseif page_encoding == Encoding.PLAIN - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= T.(vals) - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = T(value) - val_index += 1 - end - end + # seek to the first data page + seek(fileio, colchunk_meta.data_page_offset) + + # repeated read data page + while from - last_from < row_group.num_rows + from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 + end + last_from = from + end + + res +end + +function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integer = 1) + """ + This function assumes + """ + + # the result length is used latter on to prevent writing too much data + res_len = length(res) + + to = from # intialise to something + + data_page_header = read_thrift(fileio, PAR2.PageHeader) + compressed_data = read(fileio, data_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, codec) + @assert length(uncompressed_data) == data_page_header.uncompressed_page_size + + # this is made up of these 3 things written back to back + # * repetition levels - can be ignored for unnested data + # * definition levels - + # * values + + # definition levels + # do_read_defn_lvls = isfilled(data_page_header.data_page_header, :statistics) && + # isfilled(data_page_header.data_page_header.statistics, :null_count) && + # data_page_header.data_page_header.statistics.null_count > 0 + uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) + + if data_page_header.data_page_header.definition_level_encoding == PAR2.Encoding.RLE + # for unnested columns the highest possible value for definiton is 1 + # which can represented with just one bit so the bit width is always 1 + bitwidth = 1 + encoded_data_len = read(uncompressed_data_io, UInt32) + pos_before_encoded_data = position(uncompressed_data_io) + encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + + if iseven(encoded_data_header) + # RLE encoded + rle_len = Int(encoded_data_header >> 1) + rle_val = read(uncompressed_data_io, 1) + pos_after_reading_encoded_data = position(uncompressed_data_io) + else + # bitpacked encoded + bit_pack_len = Int(encoded_data_header >> 1) + end + else + error("encoding not supported") + end + + @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len + + # this is how many values should have been read + num_values_check = data_page_header.data_page_header.num_values + + # valuess + 
if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN + # just return the data as is + # TODO would it better if take! is done? + + if T == Bool + # for boolean every bit is a value so the length is 8 times + digits(UInt8, read(uncompressed_data_io), base=2) + len_raw_data = 8length(raw_data) + else + pos_for_pointer = position(uncompressed_data_io) + 1 + src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) + dest_ptr = Ptr{T}(pointer(res, from)) + # copy content over + GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, num_values_check) + to = min(from + num_values_check - 1, res_len) + + + # raw_data = reinterpret(T, read(uncompressed_data_io)) + # len_raw_data = length(raw_data) + # to = min(from + len_raw_data - 1, res_len) + #res[from:to] .= raw_data + end + elseif data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY + # this means the data is encoded in integers format which form the indices to the data + bitwidth = Int(read(uncompressed_data_io, UInt8)) + + # the documented max bitwidth is + @assert bitwidth <= 32 + + while !eof(uncompressed_data_io) + # println(position(uncompressed_data_io)) + encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + + if iseven(encoded_data_header) + # RLE encoded + rle_len = Int(encoded_data_header >> 1) + rle_val_vec::Vector{UInt8} = read(uncompressed_data_io, ceil(Int, bitwidth/8)) + rle_val = UInt(0) + + for tmp in @view rle_val_vec[end:-1:1] + rle_val = rle_val << 8 + rle_val = rle_val | tmp end + + to = min(from + rle_len - 1, res_len) + res[from:to] .= dict[rle_val+1] + + from = from + rle_len else - error("page encoding not supported yet") - end + # bitpacked encoded + bit_pack_len = Int(encoded_data_header >> 1) + @assert (bit_pack_len >= 1) && (bit_pack_len <= 2^31 - 1) + bytes_to_read = bitwidth*bit_pack_len + data = read(uncompressed_data_io, bytes_to_read) + bp = BitPackedIterator(data, bitwidth) + # now need a decoding algorithm to break it up + # reading `bitwidth` bits at a time + l = length(bp) + to = min(from + l - 1, res_len) - write_cursor += length(definitions) + for (v, i) in zip(bp, from:to) + res[i] = dict[v+1] + end + from = from + l + end end + else + erorr("encoding not supported") end - return res + + to end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 92e55c5..84c4ecd 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -1,164 +1,97 @@ +using Parquet +using Parquet:TYPES, read_thrift, PAR2, BitPackedIterator, decompress_with_codec +using Thrift: isfilled +using Snappy, CodecZlib, CodecZstd +path = "c:/git/parquet-data-collection/dsd50p.parquet" +path = "c:/data/Performance_2003Q3.txt.parquet" + +meta = Parquet.metadata(path); +par = ParFile(path); + +nrows(par) + +colnames(par) using Random: randstring -test_write1() = begin - tbl = ( - int32 = rand(Int32, 1000), - int64 = rand(Int64, 1000), - float32 = rand(Float32, 1000), - float64 = rand(Float64, 1000), - bool = rand(Bool, 1000), - string = [randstring(8) for i in 1:1000], - int32m = rand([missing, rand(Int32, 10)...], 1000), - int64m = rand([missing, rand(Int64, 10)...], 1000), - float32m = rand([missing, rand(Float32, 10)...], 1000), - float64m = rand([missing, rand(Float64, 10)...], 1000), - boolm = rand([missing, true, false], 1000), - stringm = rand([missing, "abc", "def", "ghi"], 1000) - ) - - write_parquet("c:/scratch/plsdel.parquet", tbl) +tbl = ( + int32 = rand(Int32, 1000), + int64 = rand(Int64, 1000), + float32 = rand(Float32, 1000), 
+ float64 = rand(Float64, 1000), + bool = rand(Bool, 1000), + string = [randstring(8) for i in 1:1000], + int32m = rand([missing, rand(Int32, 10)...], 1000), + int64m = rand([missing, rand(Int64, 10)...], 1000), + float32m = rand([missing, rand(Float32, 10)...], 1000), + float64m = rand([missing, rand(Float64, 10)...], 1000), + boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) +) + +tmpfile = tempname()*".parquet" + +write_parquet(tmpfile, tbl) + +path = tmpfile + +for i in 1:12 + @time col1 = Parquet.read_column(path, i); end -test_write1() +@time col1 = Parquet.read_column(path, 1) +col1 == tbl.int32 +col_num = 5 + +filemetadata = Parquet.metadata(path) par = ParFile(path) +fileio = open(path) T = TYPES[filemetadata.schema[col_num+1]._type+1] + # TODO detect if missing is necessary res = Vector{Union{Missing, T}}(missing, nrows(par)) -write_cursor = 1 -for row_group in filemetadata.row_groups - pgs = pages(par, row_group.columns[col_num]) - - drop_page_count = 0 - # is the first page a dictionary page - # this is not the case for boolean values for example - if isfilled(pgs[1].hdr, :dictionary_page_header) - # the first page is almost always the dictionary page - dictionary_page = pgs[1] - drop_page_count = 1 - dictionary_of_values = T.(values(par, dictionary_page)[1]) - end - # TODO deal with other types of pages e.g. dataheaderv2 - - # everything after the first data datapages - for data_page in Base.Iterators.drop(pgs, drop_page_count) - vals, definitions, decode = values(par, data_page) - - @assert all(in((0, 1)), definitions) - - l = sum(==(1), definitions) - # if all definitions values are 1 then it's not used - definitions_not_used = all(==(1), definitions) - - # data_page can be either - # * dictionary-encoded in which case we should look into the dictionary - # * plained-encoded in which case just return the values - page_encoding = Parquet.page_encoding(data_page) - - if page_encoding == Encoding.PLAIN_DICTIONARY - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = dictionary_of_values[value + 1] - val_index += 1 - end - end - end - elseif page_encoding == Encoding.PLAIN - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= T.(vals) - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = T(value) - val_index += 1 - end - end - end - else - error("page encoding not supported yet") - end - - write_cursor += length(definitions) +length(filemetadata.row_groups) + +from = 1 +last_from = from + +row_group = filemetadata.row_groups[1] + +colchunk_meta = row_group.columns[col_num].meta_data + +if isfilled(colchunk_meta, :dictionary_page_offset) + seek(fileio, colchunk_meta.dictionary_page_offset) + dict_page_header = read_thrift(fileio, PAR2.PageHeader) + compressed_data = read(fileio, dict_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) + @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size + + if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY + # see https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 + # which is in effect the plain 
encoding see https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 + dict = reinterpret(T, uncompressed_data) + else + error("Only Plain Dictionary encoding is supported") end +else + dict = nothing end -return res -par = ParFile(path) -T = TYPES[filemetadata.schema[col_num+1]._type+1] -# TODO detect if missing is necessary -res = Vector{Union{Missing, T}}(missing, nrows(par)) -write_cursor = 1 -for row_group in filemetadata.row_groups - pgs = pages(par, row_group.columns[col_num]) - - drop_page_count = 0 - # is the first page a dictionary page - # this is not the case for boolean values for example - if isfilled(pgs[1].hdr, :dictionary_page_header) - # the first page is almost always the dictionary page - dictionary_page = pgs[1] - drop_page_count = 1 - dictionary_of_values = T.(values(par, dictionary_page)[1]) - end +# seek to the first data page +seek(fileio, colchunk_meta.data_page_offset) - # TODO deal with other types of pages e.g. dataheaderv2 - - # everything after the first data datapages - for data_page in Base.Iterators.drop(pgs, drop_page_count) - vals, definitions, decode = values(par, data_page) - - @assert all(in((0, 1)), definitions) - - l = sum(==(1), definitions) - # if all definitions values are 1 then it's not used - definitions_not_used = all(==(1), definitions) - - # data_page can be either - # * dictionary-encoded in which case we should look into the dictionary - # * plained-encoded in which case just return the values - page_encoding = Parquet.page_encoding(data_page) - - if page_encoding == Encoding.PLAIN_DICTIONARY - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = dictionary_of_values[value + 1] - val_index += 1 - end - end - end - elseif page_encoding == Encoding.PLAIN - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= T.(vals) - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = T(value) - val_index += 1 - end - end - end - else - error("page encoding not supported yet") - end - - write_cursor += length(definitions) - end +pg = read_thrift(fileio, PAR2.PageHeader) + +Parquet.read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + +# repeated read data page +while from - last_from < row_group.num_rows + from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 end -return res +last_from = from + + +res From 87160411c2702d09d18c86b275a5e91e8c0fd65a Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 25 May 2020 01:25:56 +1000 Subject: [PATCH 44/52] minor copying memory is much faster --- src/column_reader.jl | 8 +------- src/column_reader_dev.jl | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 1589872..ce64846 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -51,9 +51,6 @@ function decompress_with_codec(compressed_data::Vector{UInt8}, codec)::Vector{UI end end -zero_or_missing(::Type{String}) = missing -zero_or_missing(::Type{T}) where T = zero(T) - function read_column(path, col_num) filemetadata = Parquet.metadata(path) par = ParFile(path) @@ -63,9 +60,6 @@ function read_column(path, col_num) # TODO detect if missing is necessary res = Vector{Union{Missing, T}}(undef, nrows(par)) - 
res .= zero_or_missing(T) - - length(filemetadata.row_groups) from = 1 last_from = from @@ -176,7 +170,7 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # raw_data = reinterpret(T, read(uncompressed_data_io)) # len_raw_data = length(raw_data) # to = min(from + len_raw_data - 1, res_len) - #res[from:to] .= raw_data + # res[from:to] .= raw_data end elseif data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY # this means the data is encoded in integers format which form the indices to the data diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 84c4ecd..82c05a1 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -42,6 +42,28 @@ end @time col1 = Parquet.read_column(path, 1) col1 == tbl.int32 +using BenchmarkTools + +@benchmark Parquet.read_column($path, 1) + + + + + + + + + +@benchmark Parquet.read_column($path, 1) + + + + + + + + + col_num = 5 filemetadata = Parquet.metadata(path) From 327c66ef605a6a2a34cae963a18ac5779d11192c Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Tue, 26 May 2020 23:29:21 +1000 Subject: [PATCH 45/52] fixed most of the non dictionary value reads --- Project.toml | 2 + src/column_reader.jl | 187 +++++++++++++++++++++++++++++++-------- src/column_reader_dev.jl | 106 +++++++--------------- 3 files changed, 188 insertions(+), 107 deletions(-) diff --git a/Project.toml b/Project.toml index 0f1ce6a..86dc08f 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,8 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +NamedTupleTools = "d9ec5142-1e00-5aa0-9d6a-321866360f50" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22" diff --git a/src/column_reader.jl b/src/column_reader.jl index 1773192..77b6b03 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -59,10 +59,18 @@ function read_column(path, col_num) T = TYPES[filemetadata.schema[col_num+1]._type+1] # TODO detect if missing is necessary - res = Vector{Union{Missing, T}}(undef, nrows(par)) + if T == String + # the memory structure of String is different to other supported types + # so it's better to initialise it with missing + res = Vector{Union{Missing, String}}(missing, nrows(par)) + else + res = Vector{Union{Missing, T}}(undef, nrows(par)) + end from = 1 last_from = from + + j = 1 for row_group in filemetadata.row_groups colchunk_meta = row_group.columns[col_num].meta_data @@ -76,7 +84,18 @@ function read_column(path, col_num) if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY # see https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 # which is in effect the plain encoding see https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 - dict = reinterpret(T, uncompressed_data) + if T == String + dict = Vector{String}(undef, dict_page_header.dictionary_page_header.num_values) + uncompressed_data_io = IOBuffer(uncompressed_data) + j = 1 + while !eof(uncompressed_data_io) + str_len = read(uncompressed_data_io, UInt32) + dict[j] = String(read(uncompressed_data_io, str_len)) + j += 1 + end + else + dict = reinterpret(T, uncompressed_data) + end else error("Only Plain Dictionary 
encoding is supported") end @@ -88,10 +107,15 @@ function read_column(path, col_num) seek(fileio, colchunk_meta.data_page_offset) # repeated read data page - while from - last_from < row_group.num_rows + + while (from - last_from < row_group.num_rows) & (from <= length(res)) from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 end last_from = from + + # (j == 1) && return res + j += 1 + end res @@ -99,7 +123,7 @@ end function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integer = 1) """ - This function assumes + Read one data page """ # the result length is used latter on to prevent writing too much data @@ -119,6 +143,12 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) + # this will be set in future + has_missing = false + + # the number of values stored in this page + num_values = data_page_header.data_page_header.num_values + # definition levels # do_read_defn_lvls = isfilled(data_page_header.data_page_header, :statistics) && # isfilled(data_page_header.data_page_header.statistics, :null_count) && @@ -131,12 +161,28 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ pos_before_encoded_data = position(uncompressed_data_io) encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + # TODO it's possible to be mixing RLE and bitpacked in one algorithm if iseven(encoded_data_header) # RLE encoded rle_len = Int(encoded_data_header >> 1) - rle_val = read(uncompressed_data_io, 1) + rle_val = read(uncompressed_data_io, UInt8) + pos_after_reading_encoded_data = position(uncompressed_data_io) + + if T == String + # strings memoery are stored differently so can't benefit from this + else + # fill the memory location with all missing + GC.@preserve res begin + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from - 1 + tmparray = unsafe_wrap(Vector{UInt8}, dest_ptr, num_values) + fill!(tmparray, rle_val) + end + end else + # the only reaosn to use bitpacking is because there are missings + has_missing = true + # bitpacked encoded bit_pack_len = Int(encoded_data_header >> 1) @@ -146,51 +192,122 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ pos_after_reading_encoded_data = position(uncompressed_data_io) # the structure of Vector{Union{T, Missing}} is - # * the T values first + # * the `values::T` first # * the missing are stored with UInt8(0) for missing # * and UInt8(1) otherwise # see https://docs.julialang.org/en/v1/devdocs/isbitsunionarrays/ - missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect - src_ptr = Ptr{UInt8}(pointer(missing_bytes)) - dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + # TODO I suspect this is not the fastest way to unpack bitwidth = 1 + # data + @assert bitwidth == 1 + bp = BitPackedIterator(data, bitwidth) + + missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect - # copy content over - GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, res_len) + if T == String + # do nothing + else + GC.@preserve missing_bytes res begin + src_ptr = Ptr{UInt8}(pointer(missing_bytes)) + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from - 1 + # copy content over + unsafe_copyto!(dest_ptr, src_ptr, res_len) + end + end end else - error("encoding not supported") + error("no definition encoding not supported") end + # this line ensures that we have read all the encoded 
definition data @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len - # this is how many values should have been read - num_values_check = data_page_header.data_page_header.num_values - - # valuess + # read values if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN # just return the data as is - # TODO would it better if take! is done? - if T == Bool - # for boolean every bit is a value so the length is 8 times - digits(UInt8, read(uncompressed_data_io), base=2) - len_raw_data = 8length(raw_data) + to = min(from + num_values - 1, res_len) + + if has_missing + upto = 1 + raw_data = Vector{Bool}(undef, 8) + for (i, missing_byte) in zip(from:to, missing_bytes) + if missing_byte == 1 + if upto == 1 + digits!(raw_data, read(uncompressed_data_io, UInt8), base=2) + end + res[i] = raw_data[upto] + upto += 1 + if upto == 9 + upto = 1 + end + end + end + else + # for boolean every bit is a value so the length is 8 times + i = from + while !eof(uncompressed_data_io) + udi = read(uncompressed_data_io, UInt8) + raw_data = Base.unsafe_wrap(Vector{Bool}, pointer(res, i) |> Ptr{Bool}, (8,)) + digits!(raw_data, udi, base=2) + + if i + 8 - 1 <= res_len + digits!(raw_data, udi, base=2) + i += 8 + else + for rd in digits(Bool, udi, base=2, pad = 8) + if i <= res_len + res[i] = rd + end + i += 1 + end + end + end + end + elseif T == String + to = min(from + num_values - 1, res_len) + if has_missing + for (i, missing_byte) in zip(from:to, missing_bytes) + if missing_byte == 1 + # 1 means not missing + str_len = read(uncompressed_data_io, UInt32) + res[i] = String(read(uncompressed_data_io, str_len)) + end + end + else + i = from + while !eof(uncompressed_data_io) + str_len = read(uncompressed_data_io, UInt32) + res[i] = String(read(uncompressed_data_io, str_len)) + i = i + 1 + end + end + else - # the copying approach is alot faster than the commented out - # assignment approach - pos_for_pointer = position(uncompressed_data_io) + 1 - src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) - dest_ptr = Ptr{T}(pointer(res, from)) - # copy content over - GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, num_values_check) - to = min(from + num_values_check - 1, res_len) - - - # raw_data = reinterpret(T, read(uncompressed_data_io)) - # len_raw_data = length(raw_data) - # to = min(from + len_raw_data - 1, res_len) - # @inbounds res[from:to] .= raw_data + if has_missing + raw_data = reinterpret(T, read(uncompressed_data_io)) + to = min(from + num_values - 1, res_len) + + j = 1 + for (i, missing_byte) in zip(from:to, missing_bytes) + if missing_byte == 1 + # 1 means not missing + res[i] = raw_data[j] + j += 1 + end + end + else + # if there is no missing, can just copy the data into the + # right memory location + # the copying approach is alot faster than the commented out + # assignment approach + pos_for_pointer = position(uncompressed_data_io) + 1 + src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) + dest_ptr = Ptr{T}(pointer(res, from)) + # copy content over + GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, num_values) + to = min(from + num_values - 1, res_len) + end end elseif data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY # this means the data is encoded in integers format which form the indices to the data diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 7ad8215..361e133 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -13,6 +13,8 @@ 
nrows(par) colnames(par) +@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); + using Random: randstring tbl = ( int32 = rand(Int32, 1000), @@ -27,94 +29,54 @@ tbl = ( float64m = rand([missing, rand(Float64, 10)...], 1000), boolm = rand([missing, true, false], 1000), stringm = rand([missing, "abc", "def", "ghi"], 1000) -) +); tmpfile = tempname()*".parquet" -write_parquet(tmpfile, tbl) +write_parquet(tmpfile, tbl); path = tmpfile -@time col1 = Parquet.read_column(path, 4) - -for i in 1:12 - @time col1 = Parquet.read_column(path, i); -end - -function read_filep(path, n) - collect(Parquet.read_column(path, i) for i in 1:n) +col_num = 3 +@time col1 = Parquet.read_column(path, col_num); +col1 +correct = getproperty(tbl, keys(tbl)[col_num]) +all(ismissing.(col1) .== ismissing.(correct)) +all(skipmissing(col1) .== skipmissing(correct)) + +using Test +checkcol(col_num) = begin + println(col_num) + @time col1 = Parquet.read_column(path, col_num); + # correct = getproperty(tbl, keys(tbl)[col_num]) + # @test all(ismissing.(col1) .== ismissing.(correct)) + # @test all(skipmissing(col1) .== skipmissing(correct)) end -@time a = read_filep(path, 4); - - - - - - - - - -@benchmark Parquet.read_column($path, 1) - - +@time checkcol.(1:31) - - - - -col_num = 5 - -filemetadata = Parquet.metadata(path) -par = ParFile(path) -fileio = open(path) - -T = TYPES[filemetadata.schema[col_num+1]._type+1] - -# TODO detect if missing is necessary -res = Vector{Union{Missing, T}}(missing, nrows(par)) - -length(filemetadata.row_groups) - -from = 1 -last_from = from - -row_group = filemetadata.row_groups[1] - -colchunk_meta = row_group.columns[col_num].meta_data - -if isfilled(colchunk_meta, :dictionary_page_offset) - seek(fileio, colchunk_meta.dictionary_page_offset) - dict_page_header = read_thrift(fileio, PAR2.PageHeader) - compressed_data = read(fileio, dict_page_header.compressed_page_size) - uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) - @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size - - if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY - # see https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 - # which is in effect the plain encoding see https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 - dict = reinterpret(T, uncompressed_data) - else - error("Only Plain Dictionary encoding is supported") +using Base.Threads: @spawn +read1(path, n) = begin + result = Vector{Any}(undef, length(n)) + for i in n + result[i] = @spawn Parquet.read_column(path, i) end -else - dict = nothing + fetch.(result) end -# seek to the first data page -seek(fileio, colchunk_meta.data_page_offset) +@time a = read1(path, 1:5) -pg = read_thrift(fileio, PAR2.PageHeader) +using DataFrames -Parquet.read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) +@time ba=DataFrame(a, copycols=false) +@time ba=DataFrame(a) + +b1 -# repeated read data page -while from - last_from < row_group.num_rows - from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 -end -last_from = from +import Base: add_int +@edit Base.add_int(100, 1) -res +add_int From 02836c2b0dd983bb4fcf4f6f54a4bb6e1d418086 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 May 2020 15:36:33 +1000 Subject: [PATCH 46/52] more updates --- src/column_reader.jl | 154 +++++++++++++++++++++++---------------- src/column_reader_dev.jl | 25 
+++++-- src/metadata.jl | 4 + 3 files changed, 116 insertions(+), 67 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 77b6b03..743d42a 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -51,13 +51,15 @@ function decompress_with_codec(compressed_data::Vector{UInt8}, codec)::Vector{UI end end -function read_column(path, col_num) +read_column(path, col_num) = begin filemetadata = Parquet.metadata(path) - par = ParFile(path) - fileio = open(path) + read_column(path, filemetadata, col_num) +end +function read_column(path, filemetadata, col_num) T = TYPES[filemetadata.schema[col_num+1]._type+1] + par = ParFile(path) # TODO detect if missing is necessary if T == String # the memory structure of String is different to other supported types @@ -66,6 +68,9 @@ function read_column(path, col_num) else res = Vector{Union{Missing, T}}(undef, nrows(par)) end + close(par) + + fileio = open(path) from = 1 last_from = from @@ -109,7 +114,13 @@ function read_column(path, col_num) # repeated read data page while (from - last_from < row_group.num_rows) & (from <= length(res)) - from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 + from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + + if from isa Tuple + return from + else + from += 1 + end end last_from = from @@ -159,60 +170,83 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ bitwidth = 1 encoded_data_len = read(uncompressed_data_io, UInt32) pos_before_encoded_data = position(uncompressed_data_io) - encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) - # TODO it's possible to be mixing RLE and bitpacked in one algorithm - if iseven(encoded_data_header) - # RLE encoded - rle_len = Int(encoded_data_header >> 1) - rle_val = read(uncompressed_data_io, UInt8) + from_defn = from - pos_after_reading_encoded_data = position(uncompressed_data_io) + pos_after_reading_encoded_data = pos_before_encoded_data - if T == String - # strings memoery are stored differently so can't benefit from this - else - # fill the memory location with all missing - GC.@preserve res begin - dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from - 1 - tmparray = unsafe_wrap(Vector{UInt8}, dest_ptr, num_values) - fill!(tmparray, rle_val) + # initialise it to something + missing_bytes = UInt8[] + + while (pos_after_reading_encoded_data - pos_before_encoded_data) < encoded_data_len + encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + + # TODO it's possible to be mixing RLE and bitpacked in one algorithm + if iseven(encoded_data_header) + # RLE encoded + rle_len = Int(encoded_data_header >> 1) + rle_val = read(uncompressed_data_io, UInt8) + + pos_after_reading_encoded_data = position(uncompressed_data_io) + + if T == String + # strings memoery are stored differently so can't benefit from this + else + # fill the memory location with all missing + GC.@preserve res begin + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 + tmparray = unsafe_wrap(Vector{UInt8}, dest_ptr, rle_len) + fill!(tmparray, rle_val) + end end - end - else - # the only reaosn to use bitpacking is because there are missings - has_missing = true - # bitpacked encoded - bit_pack_len = Int(encoded_data_header >> 1) + from_defn = min(from_defn + rle_len - 1, res_len) + else + # the only reaosn to use bitpacking is because there are missings + has_missing = true - bytes_to_read = bitwidth*bit_pack_len - data = read(uncompressed_data_io, 
bytes_to_read) + # bitpacked encoded + bit_pack_len = Int(encoded_data_header >> 1) - pos_after_reading_encoded_data = position(uncompressed_data_io) + bytes_to_read = bitwidth*bit_pack_len + data = read(uncompressed_data_io, bytes_to_read) - # the structure of Vector{Union{T, Missing}} is - # * the `values::T` first - # * the missing are stored with UInt8(0) for missing - # * and UInt8(1) otherwise - # see https://docs.julialang.org/en/v1/devdocs/isbitsunionarrays/ + pos_after_reading_encoded_data = position(uncompressed_data_io) - # TODO I suspect this is not the fastest way to unpack bitwidth = 1 - # data - @assert bitwidth == 1 - bp = BitPackedIterator(data, bitwidth) + # the structure of Vector{Union{T, Missing}} is + # * the `values::T` first + # * the missing are stored with UInt8(0) for missing + # * and UInt8(1) otherwise + # see https://docs.julialang.org/en/v1/devdocs/isbitsunionarrays/ - missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect + # TODO I suspect this is not the fastest way to unpack bitwidth = 1 + # data + @assert bitwidth == 1 + bp = BitPackedIterator(data, bitwidth) - if T == String - # do nothing - else - GC.@preserve missing_bytes res begin - src_ptr = Ptr{UInt8}(pointer(missing_bytes)) - dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from - 1 - # copy content over - unsafe_copyto!(dest_ptr, src_ptr, res_len) + missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect + + if T == String + # do nothing + else + GC.@preserve missing_bytes res begin + if from_defn + length(missing_bytes) - 1 <= res_len + # if not too long then can straight copy + src_ptr = Ptr{UInt8}(pointer(missing_bytes)) + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 + # copy content over + unsafe_copyto!(dest_ptr, src_ptr, length(missing_bytes)) + else + missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(missing_bytes), res_len - from_defn + 1) + src_ptr = Ptr{UInt8}(pointer(missing_bytes_smaller)) + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 + # copy content over + unsafe_copyto!(dest_ptr, src_ptr, length(missing_bytes_smaller)) + end + end end + + from_defn = min(from_defn + length(missing_bytes) - 1, res_len) end end else @@ -222,12 +256,14 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # this line ensures that we have read all the encoded definition data @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len + # read values + to = from + num_values - 1 + @assert to <= res_len + if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN # just return the data as is if T == Bool - to = min(from + num_values - 1, res_len) - if has_missing upto = 1 raw_data = Vector{Bool}(undef, 8) @@ -265,7 +301,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end end elseif T == String - to = min(from + num_values - 1, res_len) if has_missing for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 @@ -286,8 +321,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else if has_missing raw_data = reinterpret(T, read(uncompressed_data_io)) - to = min(from + num_values - 1, res_len) - j = 1 for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 @@ -302,11 +335,12 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the copying approach is alot faster than the commented out # assignment approach pos_for_pointer = 
position(uncompressed_data_io) + 1 - src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) - dest_ptr = Ptr{T}(pointer(res, from)) - # copy content over - GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, num_values) - to = min(from + num_values - 1, res_len) + GC.@preserve uncompressed_data res begin + src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) + dest_ptr = Ptr{T}(pointer(res, from)) + # copy content over + unsafe_copyto!(dest_ptr, src_ptr, num_values) + end end end elseif data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY @@ -317,7 +351,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ @assert bitwidth <= 32 while !eof(uncompressed_data_io) - # println(position(uncompressed_data_io)) encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) if iseven(encoded_data_header) @@ -331,8 +364,7 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ rle_val = rle_val | tmp end - to = min(from + rle_len - 1, res_len) - res[from:to] .= dict[rle_val+1] + res[from:min(to, from + rle_len - 1)] .= dict[rle_val+1] from = from + rle_len else @@ -345,9 +377,8 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # now need a decoding algorithm to break it up # reading `bitwidth` bits at a time l = length(bp) - to = min(from + l - 1, res_len) - for (v, i) in zip(bp, from:to) + for (v, i) in zip(bp, from:min(from + l - 1, to)) res[i] = dict[v+1] end from = from + l @@ -356,6 +387,5 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else erorr("encoding not supported") end - to end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 361e133..2e1d305 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -12,8 +12,9 @@ par = ParFile(path); nrows(par) colnames(par) +close(par) -@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); +#@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); using Random: randstring tbl = ( @@ -35,25 +36,39 @@ tmpfile = tempname()*".parquet" write_parquet(tmpfile, tbl); +@time adf = read_parquet(tmpfile); + + path = tmpfile -col_num = 3 +col_num = 5 @time col1 = Parquet.read_column(path, col_num); +col1 + +uncompressed_data_io = col1[1] + +encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + +using Debugger + +filemetadata = Parquet.metadata(path); +Debugger.@enter Parquet.read_column(path, filemetadata, col_num); + col1 correct = getproperty(tbl, keys(tbl)[col_num]) all(ismissing.(col1) .== ismissing.(correct)) all(skipmissing(col1) .== skipmissing(correct)) using Test -checkcol(col_num) = begin +checkcol(path, col_num) = begin println(col_num) - @time col1 = Parquet.read_column(path, col_num); + @elapsed col1 = Parquet.read_column(path, col_num); # correct = getproperty(tbl, keys(tbl)[col_num]) # @test all(ismissing.(col1) .== ismissing.(correct)) # @test all(skipmissing(col1) .== skipmissing(correct)) end -@time checkcol.(1:31) +@time checkcol.(path, 1:31) diff --git a/src/metadata.jl b/src/metadata.jl index 1c8c5af..dad19d8 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -10,4 +10,8 @@ function metadata(path) datasize = sz - meta_len - 2SZ_PAR_MAGIC - SZ_FOOTER seek(io, SZ_PAR_MAGIC + datasize) filemetadata = read_thrift(io, PAR2.FileMetaData) + + close(io) + + filemetadata end From 070988eba96e1c67ea917d3f3cde9b4709506e22 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 
May 2020 17:23:09 +1000 Subject: [PATCH 47/52] fixed all bugs --- src/column_reader.jl | 99 ++++++++++++++++++++++++++++++++-------- src/column_reader_dev.jl | 37 +++++++++++---- 2 files changed, 108 insertions(+), 28 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 743d42a..2ce5fb7 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -140,8 +140,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the result length is used latter on to prevent writing too much data res_len = length(res) - to = from # intialise to something - data_page_header = read_thrift(fileio, PAR2.PageHeader) compressed_data = read(fileio, data_page_header.compressed_page_size) uncompressed_data = decompress_with_codec(compressed_data, codec) @@ -200,7 +198,11 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end end - from_defn = min(from_defn + rle_len - 1, res_len) + append!(missing_bytes, fill(rle_val, rle_len)) + + from_defn += rle_len + @assert from_defn - from == length(missing_bytes) + @assert length(missing_bytes) <= num_values else # the only reaosn to use bitpacking is because there are missings has_missing = true @@ -224,29 +226,58 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ @assert bitwidth == 1 bp = BitPackedIterator(data, bitwidth) - missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect + tmp_missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect + + len_of_tmp_missing_bytes = length(tmp_missing_bytes) + @assert mod(len_of_tmp_missing_bytes, 8) == 0 + + # the tmp_missing_bytes is always in a multiple of 8 so need to + # be careful not to write too much + last_from_defn = from_defn + + # compute the new from_defn + from_defn = min(from_defn + len_of_tmp_missing_bytes, from + num_values) + + len_to_write = from_defn - last_from_defn if T == String # do nothing else - GC.@preserve missing_bytes res begin - if from_defn + length(missing_bytes) - 1 <= res_len + GC.@preserve tmp_missing_bytes res begin + if len_to_write == len_of_tmp_missing_bytes + + append!(missing_bytes, tmp_missing_bytes) + + # @assert from_defn-from == length(missing_bytes) + + if length(missing_bytes) > num_values + println(tmp_missing_bytes) + println("$last_from_defn $from_defn $(from+num_values) $len_to_write $len_of_tmp_missing_bytes") + end + # @assert length(missing_bytes) <= num_values # if not too long then can straight copy - src_ptr = Ptr{UInt8}(pointer(missing_bytes)) + src_ptr = Ptr{UInt8}(pointer(tmp_missing_bytes)) dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 # copy content over - unsafe_copyto!(dest_ptr, src_ptr, length(missing_bytes)) - else - missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(missing_bytes), res_len - from_defn + 1) - src_ptr = Ptr{UInt8}(pointer(missing_bytes_smaller)) + unsafe_copyto!(dest_ptr, src_ptr, length(tmp_missing_bytes)) + elseif len_to_write < len_of_tmp_missing_bytes + tmp_missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(tmp_missing_bytes), len_to_write) + # @assert length(tmp_missing_bytes_smaller) == len_to_write + append!(missing_bytes, tmp_missing_bytes_smaller) + # @assert from_defn - from == length(missing_bytes) + # @assert length(missing_bytes) == num_values + + src_ptr = Ptr{UInt8}(pointer(tmp_missing_bytes_smaller)) dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 # copy content over - unsafe_copyto!(dest_ptr, src_ptr, 
length(missing_bytes_smaller)) + unsafe_copyto!(dest_ptr, src_ptr, length(tmp_missing_bytes_smaller)) + else + error("something is wrong") end end end - - from_defn = min(from_defn + length(missing_bytes) - 1, res_len) + # @assert from_defn-from == length(missing_bytes) + # @assert length(missing_bytes) <= num_values end end else @@ -256,12 +287,17 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # this line ensures that we have read all the encoded definition data @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len + if has_missing + @assert length(missing_bytes) == num_values + end + # read values to = from + num_values - 1 @assert to <= res_len if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN + # println("meh") # just return the data as is if T == Bool if has_missing @@ -364,7 +400,16 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ rle_val = rle_val | tmp end - res[from:min(to, from + rle_len - 1)] .= dict[rle_val+1] + if has_missing + index = from:min(to, from + rle_len - 1) + for (i, missing_byte) in zip(index, missing_bytes) + if missing_byte == 1 + res[i] = dict[rle_val+1] + end + end + else + res[from:min(to, from + rle_len - 1)] .= dict[rle_val+1] + end from = from + rle_len else @@ -373,19 +418,35 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ @assert (bit_pack_len >= 1) && (bit_pack_len <= 2^31 - 1) bytes_to_read = bitwidth*bit_pack_len data = read(uncompressed_data_io, bytes_to_read) - bp = BitPackedIterator(data, bitwidth) + # TODO remove the collect here + bp = BitPackedIterator(data, bitwidth) |> collect # now need a decoding algorithm to break it up # reading `bitwidth` bits at a time l = length(bp) - for (v, i) in zip(bp, from:min(from + l - 1, to)) - res[i] = dict[v+1] + index = from:min(from + l - 1, to) + + if has_missing + j = 1 + for (i, missing_byte) in zip(index, missing_bytes) + if missing_byte == 1 + res[i] = dict[bp[j]+1] + j += 1 + end + end + else + for (i, v) in zip(index, bp) + res[i] = dict[v+1] + end end + + from = from + l end end else erorr("encoding not supported") end - to + + return to end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 2e1d305..f1d8dee 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -6,6 +6,10 @@ using Snappy, CodecZlib, CodecZstd path = "c:/git/parquet-data-collection/dsd50p.parquet" path = "c:/data/Performance_2003Q3.txt.parquet" +col_num = 1 +@time col1 = Parquet.read_column(path, col_num); +col1 + meta = Parquet.metadata(path); par = ParFile(path); @@ -41,10 +45,14 @@ write_parquet(tmpfile, tbl); path = tmpfile -col_num = 5 -@time col1 = Parquet.read_column(path, col_num); + + col1 +col1[19:20] + +last(col1) + uncompressed_data_io = col1[1] encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) @@ -60,15 +68,26 @@ all(ismissing.(col1) .== ismissing.(correct)) all(skipmissing(col1) .== skipmissing(correct)) using Test -checkcol(path, col_num) = begin - println(col_num) - @elapsed col1 = Parquet.read_column(path, col_num); - # correct = getproperty(tbl, keys(tbl)[col_num]) - # @test all(ismissing.(col1) .== ismissing.(correct)) - # @test all(skipmissing(col1) .== skipmissing(correct)) +using Base.Threads: @spawn + +checkcol(path, n; multithreaded=true) = begin + res = Vector{Any}(undef, n) + if multithreaded + for col_num in 1:n + res[col_num] = @spawn Parquet.read_column(path, col_num); + end + return 
fetch.(res) + else + for col_num in 1:n + println(col_num) + res[col_num] = Parquet.read_column(path, col_num); + end + return res + end end -@time checkcol.(path, 1:31) +@time checkcol(path, 31, multithreaded=true); +@time checkcol(path, 31, multithreaded=false); From 08a961bd29920b75843ff1340164993c523c285a Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 May 2020 19:23:45 +1000 Subject: [PATCH 48/52] fixed memory bug --- src/column_reader.jl | 54 +++++++++++++++++++--------------------- src/column_reader_dev.jl | 52 +++++++++++++++++++++++--------------- 2 files changed, 57 insertions(+), 49 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 2ce5fb7..1f67e05 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -233,51 +233,44 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the tmp_missing_bytes is always in a multiple of 8 so need to # be careful not to write too much - last_from_defn = from_defn - # compute the new from_defn - from_defn = min(from_defn + len_of_tmp_missing_bytes, from + num_values) + new_from_defn = min(from_defn + len_of_tmp_missing_bytes, from + num_values) + + len_to_write = new_from_defn - from_defn - len_to_write = from_defn - last_from_defn + if len_to_write == len_of_tmp_missing_bytes + append!(missing_bytes, tmp_missing_bytes) + elseif len_to_write < len_of_tmp_missing_bytes + tmp_missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(tmp_missing_bytes), len_to_write) + append!(missing_bytes, tmp_missing_bytes_smaller) + else + error("something is wrong") + end if T == String # do nothing else - GC.@preserve tmp_missing_bytes res begin - if len_to_write == len_of_tmp_missing_bytes - - append!(missing_bytes, tmp_missing_bytes) - - # @assert from_defn-from == length(missing_bytes) - - if length(missing_bytes) > num_values - println(tmp_missing_bytes) - println("$last_from_defn $from_defn $(from+num_values) $len_to_write $len_of_tmp_missing_bytes") - end - # @assert length(missing_bytes) <= num_values + if len_to_write == len_of_tmp_missing_bytes + GC.@preserve tmp_missing_bytes res begin # if not too long then can straight copy src_ptr = Ptr{UInt8}(pointer(tmp_missing_bytes)) dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 # copy content over unsafe_copyto!(dest_ptr, src_ptr, length(tmp_missing_bytes)) - elseif len_to_write < len_of_tmp_missing_bytes - tmp_missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(tmp_missing_bytes), len_to_write) - # @assert length(tmp_missing_bytes_smaller) == len_to_write - append!(missing_bytes, tmp_missing_bytes_smaller) - # @assert from_defn - from == length(missing_bytes) - # @assert length(missing_bytes) == num_values - + end + elseif len_to_write < len_of_tmp_missing_bytes + GC.@preserve tmp_missing_bytes_smaller res begin src_ptr = Ptr{UInt8}(pointer(tmp_missing_bytes_smaller)) dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 # copy content over - unsafe_copyto!(dest_ptr, src_ptr, length(tmp_missing_bytes_smaller)) - else - error("something is wrong") + unsafe_copyto!(dest_ptr, src_ptr, len_to_write) end + else + error("something is wrong") end + end - # @assert from_defn-from == length(missing_bytes) - # @assert length(missing_bytes) <= num_values + from_defn = new_from_defn end end else @@ -320,7 +313,9 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ i = from while !eof(uncompressed_data_io) udi = read(uncompressed_data_io, UInt8) - raw_data = 
Base.unsafe_wrap(Vector{Bool}, pointer(res, i) |> Ptr{Bool}, (8,)) + GC.@preserve res begin + raw_data = Base.unsafe_wrap(Vector{Bool}, pointer(res, i) |> Ptr{Bool}, (8,)) + end digits!(raw_data, udi, base=2) if i + 8 - 1 <= res_len @@ -357,6 +352,7 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else if has_missing raw_data = reinterpret(T, read(uncompressed_data_io)) + return raw_data, missing_bytes j = 1 for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index f1d8dee..e42a673 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -3,23 +3,6 @@ using Parquet:TYPES, read_thrift, PAR2, BitPackedIterator, decompress_with_codec using Thrift: isfilled using Snappy, CodecZlib, CodecZstd -path = "c:/git/parquet-data-collection/dsd50p.parquet" -path = "c:/data/Performance_2003Q3.txt.parquet" - -col_num = 1 -@time col1 = Parquet.read_column(path, col_num); -col1 - -meta = Parquet.metadata(path); -par = ParFile(path); - -nrows(par) - -colnames(par) -close(par) - -#@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); - using Random: randstring tbl = ( int32 = rand(Int32, 1000), @@ -38,15 +21,42 @@ tbl = ( tmpfile = tempname()*".parquet" -write_parquet(tmpfile, tbl); +@time write_parquet(tmpfile, tbl); +path = tmpfile -@time adf = read_parquet(tmpfile); +col_num=12 +@time col1 = Parquet.read_column(path, col_num); +all(col1 .=== tbl.stringm) + + +using BenchmarkTools +@benchmark adf = read_parquet(path) -path = tmpfile + + + +path = "c:/git/parquet-data-collection/dsd50p.parquet" +path = "c:/data/Performance_2003Q3.txt.parquet" + +col_num = 1 +@time col1 = Parquet.read_column(path, col_num); +col1 + +meta = Parquet.metadata(path); +par = ParFile(path); + +nrows(par) + +colnames(par) +close(par) + +#@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); + + col1 col1[19:20] @@ -89,6 +99,8 @@ end @time checkcol(path, 31, multithreaded=true); @time checkcol(path, 31, multithreaded=false); +@time checkcol(path, 12, multithreaded=false); + using Base.Threads: @spawn From dc619e3904d10d54b3dc19f45e05410a36f7a35f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 May 2020 19:39:28 +1000 Subject: [PATCH 49/52] fixed bug with parquet reader --- src/column_reader_dev.jl | 6 +- src/read_parquet.jl | 33 ++-- src/show.jl | 417 ++++++++++++++++++++------------------- 3 files changed, 232 insertions(+), 224 deletions(-) diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index e42a673..34e9a58 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -28,12 +28,13 @@ col_num=12 @time col1 = Parquet.read_column(path, col_num); all(col1 .=== tbl.stringm) +a = read_parquet(path) using BenchmarkTools @benchmark adf = read_parquet(path) - +adf @@ -42,6 +43,9 @@ using BenchmarkTools path = "c:/git/parquet-data-collection/dsd50p.parquet" path = "c:/data/Performance_2003Q3.txt.parquet" +@time adf = read_parquet(path); + + col_num = 1 @time col1 = Parquet.read_column(path, col_num); col1 diff --git a/src/read_parquet.jl b/src/read_parquet.jl index 4916a6c..de953a9 100644 --- a/src/read_parquet.jl +++ b/src/read_parquet.jl @@ -10,15 +10,12 @@ read_parquet(path; kwargs...) = read_parquet(path, String[]; kwargs...) 
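The multithreaded path in the checkcol helper above and in the read_parquet rewrite that follows both rely on the same pattern: spawn one task per column, then fetch the results back in column order. A minimal sketch of that spawn-then-fetch pattern, with read_one_column as a hypothetical stand-in for Parquet.read_column:

    using Base.Threads: @spawn
    using NamedTupleTools: namedtuple

    # Spawn one task per column, then fetch in order so the output columns
    # line up with `colnames` regardless of task completion order.
    function read_columns_threaded(read_one_column, path, colnames::Vector{Symbol})
        tasks = [@spawn read_one_column(path, i) for i in eachindex(colnames)]
        namedtuple(colnames, fetch.(tasks))
    end

fetch rethrows any exception raised inside a task, so a failing column read surfaces at the fetch call instead of being dropped silently.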
function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = false) """function for reading parquet""" - if multithreaded - # use a bounded channel to limit - c1 = Channel{Bool}(Threads.nthreads()) - atexit(()->close(c1)) - end + par = ParFile(path) + nc = ncols(par) - nc = ncols(ParFile(path)) + colnames = [sch.name for sch in drop(par.schema.schema, 1)] - colnames = [sch.name for sch in drop(ParFile(path).schema.schema, 1)] + close(par) if length(cols) == 0 colnums = collect(1:nc) @@ -31,13 +28,8 @@ function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = filemetadata = metadata(path) if multithreaded - @showprogress for (i, j) in enumerate(colnums) - put!(c1, true) - results[i] = @spawn begin - res = read_column(path, filemetadata, j) - take!(c1) - res - end + for (i, j) in enumerate(colnums) + results[i] = @spawn read_column(path, filemetadata, j) end else @showprogress for (i, j) in enumerate(colnums) @@ -47,10 +39,11 @@ function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = symbol_col_names = collect(Symbol(col) for col in colnames[colnums]) - if multithreaded - fnl_results = collect(fetch(result) for result in results) - return namedtuple(symbol_col_names, fnl_results) - else - return namedtuple(symbol_col_names, results) - end + if multithreaded + @showprogress for i in 1:length(results) + results[i] = fetch(results[i]) + end + end + + return namedtuple(symbol_col_names, results) end diff --git a/src/show.jl b/src/show.jl index 27ef44a..fcd17a9 100644 --- a/src/show.jl +++ b/src/show.jl @@ -1,203 +1,214 @@ -function print_indent(io, n) - for d in 1:n - print(io, " ") - end -end - -function show(io::IO, cursor::RecordCursor) - par = cursor.par - rows = cursor.colcursors[1].row.rows - println(io, "Record Cursor on $(par.path)") - println(io, " rows: $rows") - - colpaths = [join(colname, '.') for colname in cursor.colnames] - println(io, " cols: $(join(colpaths, ", "))") -end - -function show(io::IO, schema::SchemaElement, indent::AbstractString="", nchildren::Vector{Int}=Int[]) - print(io, indent) - lchildren = length(nchildren) - print_indent(io, lchildren) - if isfilled(schema, :repetition_type) - r = schema.repetition_type - print(io, (r == FieldRepetitionType.REQUIRED) ? "required" : (r == FieldRepetitionType.OPTIONAL) ? 
"optional" : "repeated", " "); - end - isfilled(schema, :_type) && print(io, Thrift.enumstr(_Type, schema._type), " ") - - print(io, schema.name) - isfilled(schema, :field_id) && print(io, " (", schema.field_id, ")") - - if isfilled(schema, :converted_type) - print(io, "# (from ", Thrift.enumstr(ConvertedType, schema.converted_type)) - if schema.converted_type == ConvertedType.DECIMAL - print(io, "(", schema.scale, ".", schema.precision) - end - print(") ") - end - - if isfilled(schema, :num_children) - push!(nchildren, schema.num_children) - print(io, " {") - elseif lchildren > 0 - nchildren[lchildren] -= 1 - if nchildren[lchildren] == 0 - pop!(nchildren) - println(io, "") - print_indent(io, length(nchildren)) - print(io, indent, "}") - end - end - - println(io, "") -end - -function show(io::IO, schema::Vector{SchemaElement}, indent::AbstractString="") - println(io, indent, "Schema:") - nchildren=Int[] - for schemaelem in schema - show(io, schemaelem, indent * " ", nchildren) - end -end - -show(io::IO, schema::Schema, indent::AbstractString="") = show(io, schema.schema, indent) - -function show(io::IO, kvmeta::KeyValue, indent::AbstractString="") - println(io, indent, kvmeta.key, " => ", kvmeta.value) -end - -function show(io::IO, kvmetas::Vector{KeyValue}, indent::AbstractString="") - isempty(kvmetas) && return - println(io, indent, "Metadata:") - for kvmeta in kvmetas - show(io, kvmeta, indent * " ") - end -end - -function show_encodings(io::IO, encodings::Vector{Int32}, indent::AbstractString="") - isempty(encodings) && return - print(io, indent, "Encodings: ") - pfx = "" - for encoding in encodings - print(io, pfx, Thrift.enumstr(Encoding, encoding)) - pfx = ", " - end - println(io, "") -end - -show(io::IO, hdr::IndexPageHeader, indent::AbstractString="") = nothing -function show(io::IO, page::DictionaryPageHeader, indent::AbstractString="") - println(io, indent, page.num_values, " values") -end - -function show(io::IO, hdr::DataPageHeader, indent::AbstractString="") - println(io, indent, hdr.num_values, " values") - println(io, indent, "encodings: values as ", Thrift.enumstr(Encoding, hdr.encoding), ", definitions as ", Thrift.enumstr(Encoding, hdr.definition_level_encoding), ", repetitions as ", Thrift.enumstr(Encoding, hdr.repetition_level_encoding)) - Thrift.isfilled(hdr, :statistics) && show(io, hdr.statistics, indent) -end - -function show(io::IO, hdr::DataPageHeaderV2, indent::AbstractString="") - compressed = Thrift.isfilled(hdr, :is_compressed) ? 
hdr.is_compressed : true - println(io, indent, hdr.num_values, " values, ", hdr.num_nulls, " nulls, ", hdr.num_rows, " rows, compressed:", compressed) - println(io, indent, "encoding:", Thrift.enumstr(Encoding, hdr.encoding), ", definition:", Thrift.enumstr(Encoding, hdr.definition_level_encoding), ", repetition:", Thrift.enumstr(Encoding, hdr.repetition_level_encoding)) - Thrift.isfilled(hdr, :statistics) && show(io, hdr.statistics, indent) -end - -function show(io::IO, page::PageHeader, indent::AbstractString="") - println(io, indent, Thrift.enumstr(PageType, page._type), " compressed bytes:", page.compressed_page_size, " (", page.uncompressed_page_size, " uncompressed)") - Thrift.isfilled(page, :data_page_header) && show(io, page.data_page_header, indent * " ") - Thrift.isfilled(page, :data_page_header_v2) && show(io, page.data_page_header_v2, indent * " ") - Thrift.isfilled(page, :index_page_header) && show(io, page.index_page_header, indent * " ") - Thrift.isfilled(page, :dictionary_page_header) && show(io, page.dictionary_page_header, indent * " ") -end - -function show(io::IO, pages::Vector{PageHeader}, indent::AbstractString="") - println(io, indent, "Pages:") - for page in pages - show(io, page, indent * " ") - end -end - -show(io::IO, page::Page, indent::AbstractString="") = show(io, page.hdr, indent) -show(io::IO, pages::Vector{Page}, indent::AbstractString="") = show(io, [page.hdr for page in pages], indent) - -function show(io::IO, stat::Statistics, indent::AbstractString="") - println(io, indent, "Statistics:") - if Thrift.isfilled(stat, :min) && Thrift.isfilled(stat, :max) - println(io, indent, " range:", stat.min, ":", stat.max) - elseif Thrift.isfilled(stat, :min) - println(io, indent, " min:", stat.min) - elseif Thrift.isfilled(stat, :max) - println(io, indent, " max:", stat.max) - end - Thrift.isfilled(stat, :null_count) && println(io, indent, " null count:", stat.null_count) - Thrift.isfilled(stat, :distinct_count) && println(io, indent, " distinct count:", stat.distinct_count) -end - -function show(io::IO, page_enc::PageEncodingStats, indent::AbstractString="") - println(io, indent, page_enc.count, " ", Thrift.enumstr(Encoding, page_enc.encoding), " encoded ", Thrift.enumstr(PageType, page_enc.page_type), " pages") -end - -function show(io::IO, page_encs::Vector{PageEncodingStats}, indent::AbstractString="") - isempty(page_encs) && return - println(io, indent, "Page encoding statistics:") - for page_enc in page_encs - show(io, page_enc, indent * " ") - end -end - -function show(io::IO, colmeta::ColumnMetaData, indent::AbstractString="") - println(io, indent, Thrift.enumstr(_Type, coltype(colmeta)), " ", join(colname(colmeta), '.'), ", num values:", colmeta.num_values) - show_encodings(io, colmeta.encodings, indent) - if colmeta.codec != CompressionCodec.UNCOMPRESSED - println(io, indent, Thrift.enumstr(CompressionCodec, colmeta.codec), " compressed bytes:", colmeta.total_compressed_size, " (", colmeta.total_uncompressed_size, " uncompressed)") - else - println(io, indent, Thrift.enumstr(CompressionCodec, colmeta.codec), " bytes:", colmeta.total_compressed_size) - end - - print(io, indent, "offsets: data:", colmeta.data_page_offset) - Thrift.isfilled(colmeta, :index_page_offset) && print(io, ", index:", colmeta.index_page_offset) - Thrift.isfilled(colmeta, :dictionary_page_offset) && print(io, ", dictionary:", colmeta.dictionary_page_offset) - println(io, "") - Thrift.isfilled(colmeta, :statistics) && show(io, colmeta.statistics, indent) - Thrift.isfilled(colmeta, 
:encoding_stats) && show(io, colmeta.encoding_stats, indent) - Thrift.isfilled(colmeta, :key_value_metadata) && show(io, colmeta.key_value_metadata, indent) -end - -function show(io::IO, columns::Vector{ColumnChunk}, indent::AbstractString="") - for col in columns - path = isfilled(col, :file_path) ? col.file_path : "" - println(io, indent, "Column at offset: ", path, "#", col.file_offset) - show(io, col.meta_data, indent * " ") - end -end - -function show(io::IO, grp::RowGroup, indent::AbstractString="") - println(io, indent, "Row Group: ", grp.num_rows, " rows in ", grp.total_byte_size, " bytes") - show(io, grp.columns, indent * " ") -end - -function show(io::IO, row_groups::Vector{RowGroup}, indent::AbstractString="") - println(io, indent, "Row Groups:") - for grp in row_groups - show(io, grp, indent * " ") - end -end - -function show(io::IO, meta::FileMetaData, indent::AbstractString="") - println(io, indent, "version: ", meta.version) - println(io, indent, "nrows: ", meta.num_rows) - println(io, indent, "created by: ", meta.created_by) - - show(io, meta.schema, indent) - show(io, meta.row_groups, indent) - Thrift.isfilled(meta, :key_value_metadata) && show(io, meta.key_value_metadata, indent) -end - -function show(io::IO, par::ParFile) - println(io, "Parquet file: $(par.path)") - meta = par.meta - println(io, " version: $(meta.version)") - println(io, " nrows: $(meta.num_rows)") - println(io, " created by: $(meta.created_by)") - println(io, " cached: $(length(par.page_cache.refs)) column chunks") -end +# function print_indent(io, n) +# for d in 1:n +# print(io, " ") +# end +# end +# +# function show(io::IO, cursor::RecordCursor) +# par = cursor.par +# rows = cursor.colcursors[1].row.rows +# println(io, "Record Cursor on $(par.path)") +# println(io, " rows: $rows") +# +# colpaths = [join(colname, '.') for colname in cursor.colnames] +# println(io, " cols: $(join(colpaths, ", "))") +# end +# +# function show(io::IO, cursor::BatchedColumnsCursor) +# par = cursor.par +# rows = cursor.colcursors[1].row.rows +# println(io, "Batched Columns Cursor on $(par.path)") +# println(io, " rows: $rows") +# println(io, " batches: $(length(cursor))") +# +# colpaths = [join(colname, '.') for colname in cursor.colnames] +# println(io, " cols: $(join(colpaths, ", "))") +# end +# +# function show(io::IO, schema::SchemaElement, indent::AbstractString="", nchildren::Vector{Int}=Int[]) +# print(io, indent) +# lchildren = length(nchildren) +# print_indent(io, lchildren) +# if isfilled(schema, :repetition_type) +# r = schema.repetition_type +# print(io, (r == FieldRepetitionType.REQUIRED) ? "required" : (r == FieldRepetitionType.OPTIONAL) ? 
"optional" : "repeated", " "); +# end +# isfilled(schema, :_type) && print(io, Thrift.enumstr(_Type, schema._type), " ") +# +# print(io, schema.name) +# isfilled(schema, :field_id) && print(io, " (", schema.field_id, ")") +# +# if isfilled(schema, :converted_type) +# print(io, "# (from ", Thrift.enumstr(ConvertedType, schema.converted_type)) +# if schema.converted_type == ConvertedType.DECIMAL +# print(io, "(", schema.scale, ".", schema.precision) +# end +# print(") ") +# end +# +# if isfilled(schema, :num_children) +# push!(nchildren, schema.num_children) +# print(io, " {") +# elseif lchildren > 0 +# nchildren[lchildren] -= 1 +# if nchildren[lchildren] == 0 +# pop!(nchildren) +# println(io, "") +# print_indent(io, length(nchildren)) +# print(io, indent, "}") +# end +# end +# +# println(io, "") +# end +# +# function show(io::IO, schema::Vector{SchemaElement}, indent::AbstractString="") +# println(io, indent, "Schema:") +# nchildren=Int[] +# for schemaelem in schema +# show(io, schemaelem, indent * " ", nchildren) +# end +# end +# +# show(io::IO, schema::Schema, indent::AbstractString="") = show(io, schema.schema, indent) +# +# function show(io::IO, kvmeta::KeyValue, indent::AbstractString="") +# println(io, indent, kvmeta.key, " => ", kvmeta.value) +# end +# +# function show(io::IO, kvmetas::Vector{KeyValue}, indent::AbstractString="") +# isempty(kvmetas) && return +# println(io, indent, "Metadata:") +# for kvmeta in kvmetas +# show(io, kvmeta, indent * " ") +# end +# end +# +# function show_encodings(io::IO, encodings::Vector{Int32}, indent::AbstractString="") +# isempty(encodings) && return +# print(io, indent, "Encodings: ") +# pfx = "" +# for encoding in encodings +# print(io, pfx, Thrift.enumstr(Encoding, encoding)) +# pfx = ", " +# end +# println(io, "") +# end +# +# show(io::IO, hdr::IndexPageHeader, indent::AbstractString="") = nothing +# function show(io::IO, page::DictionaryPageHeader, indent::AbstractString="") +# println(io, indent, page.num_values, " values") +# end +# +# function show(io::IO, hdr::DataPageHeader, indent::AbstractString="") +# println(io, indent, hdr.num_values, " values") +# println(io, indent, "encodings: values as ", Thrift.enumstr(Encoding, hdr.encoding), ", definitions as ", Thrift.enumstr(Encoding, hdr.definition_level_encoding), ", repetitions as ", Thrift.enumstr(Encoding, hdr.repetition_level_encoding)) +# Thrift.isfilled(hdr, :statistics) && show(io, hdr.statistics, indent) +# end +# +# function show(io::IO, hdr::DataPageHeaderV2, indent::AbstractString="") +# compressed = Thrift.isfilled(hdr, :is_compressed) ? 
hdr.is_compressed : true +# println(io, indent, hdr.num_values, " values, ", hdr.num_nulls, " nulls, ", hdr.num_rows, " rows, compressed:", compressed) +# println(io, indent, "encoding:", Thrift.enumstr(Encoding, hdr.encoding), ", definition:", Thrift.enumstr(Encoding, hdr.definition_level_encoding), ", repetition:", Thrift.enumstr(Encoding, hdr.repetition_level_encoding)) +# Thrift.isfilled(hdr, :statistics) && show(io, hdr.statistics, indent) +# end +# +# function show(io::IO, page::PageHeader, indent::AbstractString="") +# println(io, indent, Thrift.enumstr(PageType, page._type), " compressed bytes:", page.compressed_page_size, " (", page.uncompressed_page_size, " uncompressed)") +# Thrift.isfilled(page, :data_page_header) && show(io, page.data_page_header, indent * " ") +# Thrift.isfilled(page, :data_page_header_v2) && show(io, page.data_page_header_v2, indent * " ") +# Thrift.isfilled(page, :index_page_header) && show(io, page.index_page_header, indent * " ") +# Thrift.isfilled(page, :dictionary_page_header) && show(io, page.dictionary_page_header, indent * " ") +# end +# +# function show(io::IO, pages::Vector{PageHeader}, indent::AbstractString="") +# println(io, indent, "Pages:") +# for page in pages +# show(io, page, indent * " ") +# end +# end +# +# show(io::IO, page::Page, indent::AbstractString="") = show(io, page.hdr, indent) +# show(io::IO, pages::Vector{Page}, indent::AbstractString="") = show(io, [page.hdr for page in pages], indent) +# +# function show(io::IO, stat::Statistics, indent::AbstractString="") +# println(io, indent, "Statistics:") +# if Thrift.isfilled(stat, :min) && Thrift.isfilled(stat, :max) +# println(io, indent, " range:", stat.min, ":", stat.max) +# elseif Thrift.isfilled(stat, :min) +# println(io, indent, " min:", stat.min) +# elseif Thrift.isfilled(stat, :max) +# println(io, indent, " max:", stat.max) +# end +# Thrift.isfilled(stat, :null_count) && println(io, indent, " null count:", stat.null_count) +# Thrift.isfilled(stat, :distinct_count) && println(io, indent, " distinct count:", stat.distinct_count) +# end +# +# function show(io::IO, page_enc::PageEncodingStats, indent::AbstractString="") +# println(io, indent, page_enc.count, " ", Thrift.enumstr(Encoding, page_enc.encoding), " encoded ", Thrift.enumstr(PageType, page_enc.page_type), " pages") +# end +# +# function show(io::IO, page_encs::Vector{PageEncodingStats}, indent::AbstractString="") +# isempty(page_encs) && return +# println(io, indent, "Page encoding statistics:") +# for page_enc in page_encs +# show(io, page_enc, indent * " ") +# end +# end +# +# function show(io::IO, colmeta::ColumnMetaData, indent::AbstractString="") +# println(io, indent, Thrift.enumstr(_Type, coltype(colmeta)), " ", join(colname(colmeta), '.'), ", num values:", colmeta.num_values) +# show_encodings(io, colmeta.encodings, indent) +# if colmeta.codec != CompressionCodec.UNCOMPRESSED +# println(io, indent, Thrift.enumstr(CompressionCodec, colmeta.codec), " compressed bytes:", colmeta.total_compressed_size, " (", colmeta.total_uncompressed_size, " uncompressed)") +# else +# println(io, indent, Thrift.enumstr(CompressionCodec, colmeta.codec), " bytes:", colmeta.total_compressed_size) +# end +# +# print(io, indent, "offsets: data:", colmeta.data_page_offset) +# Thrift.isfilled(colmeta, :index_page_offset) && print(io, ", index:", colmeta.index_page_offset) +# Thrift.isfilled(colmeta, :dictionary_page_offset) && print(io, ", dictionary:", colmeta.dictionary_page_offset) +# println(io, "") +# Thrift.isfilled(colmeta, 
:statistics) && show(io, colmeta.statistics, indent) +# Thrift.isfilled(colmeta, :encoding_stats) && show(io, colmeta.encoding_stats, indent) +# Thrift.isfilled(colmeta, :key_value_metadata) && show(io, colmeta.key_value_metadata, indent) +# end +# +# function show(io::IO, columns::Vector{ColumnChunk}, indent::AbstractString="") +# for col in columns +# path = isfilled(col, :file_path) ? col.file_path : "" +# println(io, indent, "Column at offset: ", path, "#", col.file_offset) +# show(io, col.meta_data, indent * " ") +# end +# end +# +# function show(io::IO, grp::RowGroup, indent::AbstractString="") +# println(io, indent, "Row Group: ", grp.num_rows, " rows in ", grp.total_byte_size, " bytes") +# show(io, grp.columns, indent * " ") +# end +# +# function show(io::IO, row_groups::Vector{RowGroup}, indent::AbstractString="") +# println(io, indent, "Row Groups:") +# for grp in row_groups +# show(io, grp, indent * " ") +# end +# end +# +# function show(io::IO, meta::FileMetaData, indent::AbstractString="") +# println(io, indent, "version: ", meta.version) +# println(io, indent, "nrows: ", meta.num_rows) +# println(io, indent, "created by: ", meta.created_by) +# +# show(io, meta.schema, indent) +# show(io, meta.row_groups, indent) +# Thrift.isfilled(meta, :key_value_metadata) && show(io, meta.key_value_metadata, indent) +# end +# +# function show(io::IO, par::ParFile) +# println(io, "Parquet file: $(par.path)") +# meta = par.meta +# println(io, " version: $(meta.version)") +# println(io, " nrows: $(meta.num_rows)") +# println(io, " created by: $(meta.created_by)") +# println(io, " cached: $(length(par.page_cache.refs)) column chunks") +# end From 9f50dad013d453730aaed62c0d009ca73ed877aa Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 May 2020 19:43:01 +1000 Subject: [PATCH 50/52] minor bug fix --- src/column_reader.jl | 1 - src/column_reader_dev.jl | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 1f67e05..f54a6bb 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -352,7 +352,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else if has_missing raw_data = reinterpret(T, read(uncompressed_data_io)) - return raw_data, missing_bytes j = 1 for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 34e9a58..bcbf2d9 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -45,6 +45,8 @@ path = "c:/data/Performance_2003Q3.txt.parquet" @time adf = read_parquet(path); +adf.V5 + col_num = 1 @time col1 = Parquet.read_column(path, col_num); From 0c81da98a7686ba13176cb0723ac1e35349232d3 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Fri, 29 May 2020 14:30:13 +1000 Subject: [PATCH 51/52] before operating on misssing bytes --- src/column_reader.jl | 68 +++++++++++++++++++++++++++++----------- src/column_reader_dev.jl | 43 +++++++++++++------------ src/read_parquet.jl | 33 ++++++++++--------- 3 files changed, 88 insertions(+), 56 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index f54a6bb..34da73a 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -70,8 +70,23 @@ function read_column(path, filemetadata, col_num) end close(par) + fileio = open(path) + # I thnk there is a bug with Julia's multithreaded reads + # which can be fixed by doing the below + # DO NOT remove the code below or multithreading will fail + println("$(position(fileio))") + if true + 
not_used = open(tempname()*string(col_num), "w") + write(not_used, position(fileio)) + close(not_used) + end + + # to reduce allocations we make a compressed_data array to store compressed data + compressed_data_buffer = Vector{UInt8}(undef, 100) + compressed_data = UInt8[] # initialise it + from = 1 last_from = from @@ -82,7 +97,14 @@ function read_column(path, filemetadata, col_num) if isfilled(colchunk_meta, :dictionary_page_offset) seek(fileio, colchunk_meta.dictionary_page_offset) dict_page_header = read_thrift(fileio, PAR2.PageHeader) - compressed_data = read(fileio, dict_page_header.compressed_page_size) + + # use the + readbytes!(fileio, compressed_data_buffer, dict_page_header.compressed_page_size) + GC.@preserve compressed_data_buffer begin + compressed_data = unsafe_wrap(Vector{UInt8}, pointer(compressed_data_buffer), dict_page_header.compressed_page_size) + end + # compressed_data = read(fileio, dict_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size @@ -100,6 +122,10 @@ function read_column(path, filemetadata, col_num) end else dict = reinterpret(T, uncompressed_data) + # nvals = dict_page_header.dictionary_page_header.num_values + # GC.@preserve uncompressed_data begin + # dict = unsafe_wrap(Vector{T}, Ptr{T}(pointer(uncompressed_data)), nvals) + # end end else error("Only Plain Dictionary encoding is supported") @@ -111,8 +137,8 @@ function read_column(path, filemetadata, col_num) # seek to the first data page seek(fileio, colchunk_meta.data_page_offset) - # repeated read data page + # repeated read data page while (from - last_from < row_group.num_rows) & (from <= length(res)) from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) @@ -124,7 +150,7 @@ function read_column(path, filemetadata, col_num) end last_from = from - # (j == 1) && return res + # (j == 2) && return res j += 1 end @@ -141,15 +167,22 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ res_len = length(res) data_page_header = read_thrift(fileio, PAR2.PageHeader) - compressed_data = read(fileio, data_page_header.compressed_page_size) - uncompressed_data = decompress_with_codec(compressed_data, codec) + + #compressed_data = read(fileio, data_page_header.compressed_page_size) + compressed_data_buffer = Vector{UInt8}(undef, ceil(Int, data_page_header.compressed_page_size*1.5)) + + readbytes!(fileio, compressed_data_buffer, data_page_header.compressed_page_size) + GC.@preserve compressed_data_buffer begin + compressed_data = unsafe_wrap(Vector{UInt8}, pointer(compressed_data_buffer), data_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, codec) + end + @assert length(uncompressed_data) == data_page_header.uncompressed_page_size # this is made up of these 3 things written back to back # * repetition levels - can be ignored for unnested data # * definition levels - # * values - uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) # this will be set in future @@ -158,10 +191,11 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the number of values stored in this page num_values = data_page_header.data_page_header.num_values + # initialise it to something + missing_bytes = Vector{UInt8}(undef, num_values) + missing_bytes_io = IOBuffer(missing_bytes, write=true) + # definition levels - # do_read_defn_lvls = 
isfilled(data_page_header.data_page_header, :statistics) && - # isfilled(data_page_header.data_page_header.statistics, :null_count) && - # data_page_header.data_page_header.statistics.null_count > 0 if data_page_header.data_page_header.definition_level_encoding == PAR2.Encoding.RLE # for unnested columns the highest possible value for definiton is 1 # which can represented with just one bit so the bit width is always 1 @@ -173,13 +207,9 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ pos_after_reading_encoded_data = pos_before_encoded_data - # initialise it to something - missing_bytes = UInt8[] - while (pos_after_reading_encoded_data - pos_before_encoded_data) < encoded_data_len encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) - # TODO it's possible to be mixing RLE and bitpacked in one algorithm if iseven(encoded_data_header) # RLE encoded rle_len = Int(encoded_data_header >> 1) @@ -198,11 +228,11 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end end - append!(missing_bytes, fill(rle_val, rle_len)) + write(missing_bytes_io, fill(rle_val, rle_len)) from_defn += rle_len - @assert from_defn - from == length(missing_bytes) - @assert length(missing_bytes) <= num_values + @assert from_defn - from == position(missing_bytes_io) + @assert position(missing_bytes_io) <= num_values else # the only reaosn to use bitpacking is because there are missings has_missing = true @@ -239,10 +269,10 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ len_to_write = new_from_defn - from_defn if len_to_write == len_of_tmp_missing_bytes - append!(missing_bytes, tmp_missing_bytes) + write(missing_bytes_io, tmp_missing_bytes) elseif len_to_write < len_of_tmp_missing_bytes tmp_missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(tmp_missing_bytes), len_to_write) - append!(missing_bytes, tmp_missing_bytes_smaller) + write(missing_bytes_io, tmp_missing_bytes_smaller) else error("something is wrong") end @@ -281,7 +311,7 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len if has_missing - @assert length(missing_bytes) == num_values + @assert position(missing_bytes_io) == num_values end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index bcbf2d9..9834207 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -1,7 +1,21 @@ using Parquet -using Parquet:TYPES, read_thrift, PAR2, BitPackedIterator, decompress_with_codec -using Thrift: isfilled -using Snappy, CodecZlib, CodecZstd + +path = "c:/data/Performance_2003Q3.txt.parquet" +@time Parquet.read_column(path, 1); + + +@time read_parquet(path); + +path = "c:/git/parquet-data-collection/dsd50p.parquet" +@time adf = read_parquet(path); + +@time adf = read_parquet(path, multithreaded=false); + + + +using JDF: type_compress! 
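The missing_bytes bookkeeping reworked above consumes Parquet's definition levels, which are stored in the RLE/bit-packed hybrid encoding: a varint header whose low bit selects between a run-length run (even) and groups of eight bit-packed values (odd). A hedged, standalone sketch of that decoder for the flat-column case (bit width 1, so 1 = value present, 0 = missing), with the varint reader passed in as an argument; the patches use the internal Parquet._read_varint for this step:

    # Sketch only, not the package's implementation.
    # `read_varint(io)` is assumed to return the next ULEB128-encoded UInt32.
    function decode_definition_levels(io::IO, num_values::Integer, read_varint)
        levels = UInt8[]
        while length(levels) < num_values
            header = read_varint(io)
            if iseven(header)
                # RLE run: the value repeats `run_len` times and is stored in
                # ceil(bitwidth/8) bytes -- a single byte at bit width 1.
                run_len = Int(header >> 1)
                val = read(io, UInt8)
                append!(levels, fill(val, run_len))
            else
                # Bit-packed: `ngroups` groups of 8 values, packed LSB first.
                ngroups = Int(header >> 1)
                for _ in 1:ngroups
                    byte = read(io, UInt8)
                    for bit in 0:7
                        push!(levels, (byte >> bit) & 0x01)
                    end
                end
            end
        end
        resize!(levels, num_values)   # a bit-packed tail always comes in multiples of 8
        return levels
    end

That multiple-of-8 padding in the bit-packed branch is exactly why the reader above tracks len_to_write and trims tmp_missing_bytes before appending it to the missing-bytes buffer.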
+ +@time adf = type_compress!(DataFrame(read_parquet(path, multithreaded=false), copycols=false)); using Random: randstring tbl = ( @@ -22,32 +36,17 @@ tbl = ( tmpfile = tempname()*".parquet" @time write_parquet(tmpfile, tbl); + path = tmpfile +@time adf = read_parquet(path); + +all([all(c1 .=== c2) for (c1, c2) in zip(tbl, adf)]) -col_num=12 -@time col1 = Parquet.read_column(path, col_num); -all(col1 .=== tbl.stringm) -a = read_parquet(path) using BenchmarkTools @benchmark adf = read_parquet(path) - -adf - - - - - -path = "c:/git/parquet-data-collection/dsd50p.parquet" -path = "c:/data/Performance_2003Q3.txt.parquet" - -@time adf = read_parquet(path); - -adf.V5 - - col_num = 1 @time col1 = Parquet.read_column(path, col_num); col1 diff --git a/src/read_parquet.jl b/src/read_parquet.jl index de953a9..68488ec 100644 --- a/src/read_parquet.jl +++ b/src/read_parquet.jl @@ -1,6 +1,6 @@ using Base.Threads: @spawn using Base.Iterators: drop -using ProgressMeter: @showprogress +using ProgressMeter: @showprogress, Progress, next! using NamedTupleTools: namedtuple read_parquet(path, cols::Vector{Symbol}; kwargs...) = read_parquet(path, String.(cols); kwargs...) @@ -25,25 +25,28 @@ function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = results = Vector{Any}(undef, length(colnums)) - filemetadata = metadata(path) + filemetadata = metadata(path) - if multithreaded + symbol_col_names = collect(Symbol(col) for col in colnames[colnums]) + + p = Progress(length(colnums)) + if multithreaded for (i, j) in enumerate(colnums) - results[i] = @spawn read_column(path, filemetadata, j) - end - else - @showprogress for (i, j) in enumerate(colnums) - results[i] = read_column(path, filemetadata, j) + results[i] = @spawn begin + # next!(p) + res = read_column(path, filemetadata, j) + res + end + end + results = fetch.(results) + else + + for (i, j) in enumerate(colnums) + results[i] = read_column(path, filemetadata, j) + next!(p) end end - symbol_col_names = collect(Symbol(col) for col in colnames[colnums]) - - if multithreaded - @showprogress for i in 1:length(results) - results[i] = fetch(results[i]) - end - end return namedtuple(symbol_col_names, results) end From f6d2309c4df8cd97e40bd7e42e8e87ef1ebca400 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 30 May 2020 11:43:51 +1000 Subject: [PATCH 52/52] before major operation on cutting down on memory usage for missing --- src/column_reader.jl | 82 ++++++++++++++++++++++++++++------------ src/column_reader_dev.jl | 8 +++- 2 files changed, 64 insertions(+), 26 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 34da73a..e0a2eec 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -1,4 +1,4 @@ -import Base: iterate, length, IteratorSize, IteratorEltype, eltype +import Base: iterate, length, IteratorSize, IteratorEltype, eltype, @_gc_preserve_begin, @_gc_preserve_end const TYPES = (Bool, Int32, Int64, Int128, Float32, Float64, String, UInt8) @@ -43,9 +43,9 @@ function iterate(bp::BitPackedIterator, state) (value & UInt(2^bp.bitwidth-1), state + 1) end -function decompress_with_codec(compressed_data::Vector{UInt8}, codec)::Vector{UInt8} +function decompress_with_codec!(uncompressed_data::Vector{UInt8}, compressed_data::Vector{UInt8}, codec) if codec == PAR2.CompressionCodec.SNAPPY - uncompressed_data = Snappy.uncompress(compressed_data) + Snappy.snappy_uncompress(compressed_data, uncompressed_data) else error("codedc $codec unsupported atm") end @@ -70,18 +70,17 @@ function read_column(path, filemetadata, 
col_num) end close(par) - fileio = open(path) # I thnk there is a bug with Julia's multithreaded reads # which can be fixed by doing the below # DO NOT remove the code below or multithreading will fail println("$(position(fileio))") - if true - not_used = open(tempname()*string(col_num), "w") - write(not_used, position(fileio)) - close(not_used) - end + # if true + # not_used = open(tempname()*string(col_num), "w") + # write(not_used, position(fileio)) + # close(not_used) + # end # to reduce allocations we make a compressed_data array to store compressed data compressed_data_buffer = Vector{UInt8}(undef, 100) @@ -105,7 +104,9 @@ function read_column(path, filemetadata, col_num) end # compressed_data = read(fileio, dict_page_header.compressed_page_size) - uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) + uncompressed_data = Vector{UInt8}(undef, dict_page_header.uncompressed_page_size) + + decompress_with_codec!(uncompressed_data, compressed_data, colchunk_meta.codec) @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY @@ -137,10 +138,12 @@ function read_column(path, filemetadata, col_num) # seek to the first data page seek(fileio, colchunk_meta.data_page_offset) + # the buffer is resizable and is used to reduce the amount of allocations + uncompressed_data_buffer = Vector{UInt8}(undef, 1048584) # repeated read data page while (from - last_from < row_group.num_rows) & (from <= length(res)) - from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + from = read_data_page_vals!(res, uncompressed_data_buffer, fileio, dict, colchunk_meta.codec, T, from) if from isa Tuple return from @@ -158,7 +161,7 @@ function read_column(path, filemetadata, col_num) res end -function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integer = 1) +function read_data_page_vals!(res, uncompressed_data_buffer::Vector{UInt8}, fileio::IOStream, dict, codec, T, from::Integer = 1) """ Read one data page """ @@ -168,29 +171,44 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ data_page_header = read_thrift(fileio, PAR2.PageHeader) + # the number of values stored in this page + num_values = data_page_header.data_page_header.num_values + # read values + to = from + num_values - 1 + @assert to <= res_len + #compressed_data = read(fileio, data_page_header.compressed_page_size) - compressed_data_buffer = Vector{UInt8}(undef, ceil(Int, data_page_header.compressed_page_size*1.5)) + compressed_data_buffer = Vector{UInt8}(undef, ceil(Int, data_page_header.compressed_page_size)) readbytes!(fileio, compressed_data_buffer, data_page_header.compressed_page_size) - GC.@preserve compressed_data_buffer begin + + # resize the buffer if it's too small + if data_page_header.uncompressed_page_size > length(uncompressed_data_buffer) + uncompressed_data_buffer = Vector{UInt8}(undef, ceil(Int, data_page_header.uncompressed_page_size*1.1)) + end + + t1 = @_gc_preserve_begin uncompressed_data_buffer + + GC.@preserve compressed_data_buffer uncompressed_data_buffer begin compressed_data = unsafe_wrap(Vector{UInt8}, pointer(compressed_data_buffer), data_page_header.compressed_page_size) - uncompressed_data = decompress_with_codec(compressed_data, codec) + uncompressed_data = unsafe_wrap(Vector{UInt8}, pointer(uncompressed_data_buffer), data_page_header.uncompressed_page_size) + # uncompressed_data = Vector{UInt8}(undef, 
data_page_header.uncompressed_page_size) + # decompression seems to be quite slow and uses lots of RAM! + decompress_with_codec!(uncompressed_data, compressed_data, codec) end @assert length(uncompressed_data) == data_page_header.uncompressed_page_size + uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) + # this is made up of these 3 things written back to back # * repetition levels - can be ignored for unnested data # * definition levels - # * values - uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) # this will be set in future has_missing = false - # the number of values stored in this page - num_values = data_page_header.data_page_header.num_values - # initialise it to something missing_bytes = Vector{UInt8}(undef, num_values) missing_bytes_io = IOBuffer(missing_bytes, write=true) @@ -222,6 +240,8 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else # fill the memory location with all missing GC.@preserve res begin + # TODO there is a better way to locate the missing bytes + # find the location of missing dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 tmparray = unsafe_wrap(Vector{UInt8}, dest_ptr, rle_len) fill!(tmparray, rle_val) @@ -315,12 +335,8 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end - # read values - to = from + num_values - 1 - @assert to <= res_len if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN - # println("meh") # just return the data as is if T == Bool if has_missing @@ -381,7 +397,13 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else if has_missing - raw_data = reinterpret(T, read(uncompressed_data_io)) + # raw_data = reinterpret(T, read(uncompressed_data_io)) + arr_pos = position(uncompressed_data_io) + 1 + # seek till the end + seek(uncompressed_data_io, uncompressed_data_io.size + 1) + # TODO remove this allocation too + ok = uncompressed_data[arr_pos:end] + raw_data = reinterpret(T, ok) j = 1 for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 @@ -411,10 +433,15 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the documented max bitwidth is @assert bitwidth <= 32 + rle_cnt = 0 + bp_cnt = 0 + rle_size = 0 + bp_size = 0 while !eof(uncompressed_data_io) encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) if iseven(encoded_data_header) + rle_cnt += 1 # RLE encoded rle_len = Int(encoded_data_header >> 1) rle_val_vec::Vector{UInt8} = read(uncompressed_data_io, ceil(Int, bitwidth/8)) @@ -436,8 +463,10 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ res[from:min(to, from + rle_len - 1)] .= dict[rle_val+1] end + rle_size += rle_len from = from + rle_len else + bp_cnt += 1 # bitpacked encoded bit_pack_len = Int(encoded_data_header >> 1) @assert (bit_pack_len >= 1) && (bit_pack_len <= 2^31 - 1) @@ -465,13 +494,16 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end end - + bp_size += l from = from + l end end + # println("rle_cnt $rle_cnt bp_cnt $bp_cnt rle_size $rle_size bp_size $bp_size") else erorr("encoding not supported") end + @_gc_preserve_end t2 + return to end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 9834207..eb5e623 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -1,7 +1,13 @@ using Parquet path = 
"c:/data/Performance_2003Q3.txt.parquet" -@time Parquet.read_column(path, 1); +#Parquet.metadata(path) +@time col = Parquet.read_column(path, 5); + +for i in 1:31 + println(i) + @time Parquet.read_column(path, i); +end @time read_parquet(path);