Parquet Writer #66

JuliaIO · May 24, 2020 · 76d0718 · 76d0718
1 parent 29308e3
commit 76d0718
Show file tree

Hide file tree

Showing 7 changed files with 716 additions and 8 deletions.
diff --git a/Project.toml b/Project.toml
@@ -2,27 +2,38 @@ name = "Parquet"
 uuid = "626c502c-15b0-58ad-a749-f091afb673ae"
 keywords = ["parquet", "julia", "columnar-storage"]
 license = "MIT"
-desc = "Julia implementation of parquet columnar file format reader"
-version = "0.5.2"
+desc = "Julia implementation of parquet columnar file format reader and writer"
+version = "0.5.3"
 
 [deps]
+CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
 CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
+DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
+LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf"
 MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
+Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22"
 
 [compat]
+CategoricalArrays = "0.6,0.7,0.8"
 CodecZlib = "0.5,0.6,0.7"
 CodecZstd = "0.6,0.7"
+DataAPI = "1"
+LittleEndianBase128 = "0.3"
 MemPool = "0.2"
+Missings = "0.3,0.4"
 Snappy = "0.3"
+Tables = "1"
 Thrift = "0.6,0.7"
 julia = "1"
 
 [extras]
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
+test = ["Test", "Random"]
diff --git a/README.md b/README.md
@@ -4,6 +4,8 @@
 [![Build status](https://ci.appveyor.com/api/projects/status/gx8pvdiiery74r9l/branch/master?svg=true)](https://ci.appveyor.com/project/tanmaykm/parquet-jl-cufdj/branch/master)
 [![Coverage Status](https://coveralls.io/repos/github/JuliaIO/Parquet.jl/badge.svg?branch=master)](https://coveralls.io/github/JuliaIO/Parquet.jl?branch=master)
 
+## Reader
+
 Load a [parquet file](https://en.wikipedia.org/wiki/Apache_Parquet). Only metadata is read initially, data is loaded in chunks on demand. (Note: [ParquetFiles.jl](https://github.com/queryverse/ParquetFiles.jl) also provides load support for Parquet files under the FileIO.jl package.)
 
 `ParFile` represents a Parquet file at `path` open for reading. Options to map logical types can be provided via `map_logical_types`.
@@ -132,3 +134,31 @@ The reader will interpret logical types based on the `map_logical_types` provide
 - `logical_string(v)`: Applicable for strings that are `BYTE_ARRAY` values. Without this, they are represented in a `Vector{UInt8}` type. With this they are converted to `String` types.
 
 Variants of these methods or custom methods can also be applied by caller.
+
+## Writer
+
+You can write any Tables.jl column-accessible table whose columns have one of these element types, or the union of one of them with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64`.
+
+`CategoricalArray`s are not yet supported, and neither are columns of these types: `Int96`, `Int128`, `Date`, and `DateTime`.
+
+### Writer Example
+
+```julia
+using Parquet
+using Random  # provides `randstring`
+
+tbl = (
+    int32 = Int32.(1:1000),
+    int64 = Int64.(1:1000),
+    float32 = Float32.(1:1000),
+    float64 = Float64.(1:1000),
+    bool = rand(Bool, 1000),
+    string = [randstring(8) for i in 1:1000],
+    int32m = rand([missing, 1:100...], 1000),
+    int64m = rand([missing, 1:100...], 1000),
+    float32m = rand([missing, Float32.(1:100)...], 1000),
+    float64m = rand([missing, Float64.(1:100)...], 1000),
+    boolm = rand([missing, true, false], 1000),
+    stringm = rand([missing, "abc", "def", "ghi"], 1000)
+)
+
+file = tempname()*".parquet"
+write_parquet(file, tbl)
+```
diff --git a/src/Parquet.jl b/src/Parquet.jl
@@ -7,13 +7,20 @@ using CodecZstd
 using MemPool
 using Dates
 
+if VERSION < v"1.3"
+    using Missings: nonmissingtype
+end
+
+const PARQUET_JL_VERSION = v"0.5.3"
+
 import Base: show, open, close, values, eltype, length
 import Thrift: isfilled
 
 export is_par_file, ParFile, show, nrows, ncols, rowgroups, columns, pages, bytes, values, colname, colnames
 export schema
 export logical_timestamp, logical_string
 export RecordCursor, BatchedColumnsCursor
+export write_parquet
 
 # package code goes here
 include("PAR2/PAR2.jl")
@@ -23,5 +30,6 @@ include("schema.jl")
 include("reader.jl")
 include("cursor.jl")
 include("show.jl")
+include("writer.jl")
 
 end # module
diff --git a/src/reader.jl b/src/reader.jl
@@ -322,6 +322,11 @@ end
 function read_levels_and_nmissing(io, defn_enc::Int32, repn_enc::Int32, num_values::Int32, par::ParFile, page::Page, defn_levels::Vector{Int32}, repn_levels::Vector{Int32}, defn_offset::Int=0, repn_offset::Int=0)
     cname = colname(page.colchunk)
 
+    #@debug("before reading repn levels bytesavailable in page: $(bytesavailable(io))")
+    # read repetition levels. skipped if all columns are at 1st level
+    max_repn_level = max_repetition_level(par.schema, cname)
+    ((length(cname) > 1) && (max_repn_level > 0)) && read_levels(io, max_repn_level, repn_enc, num_values, repn_levels, repn_offset)
+
     #@debug("before reading defn levels bytesavailable in page: $(bytesavailable(io))")
     # read definition levels. skipped if column is required
     nmissing = Int32(0)
@@ -332,11 +337,6 @@ function read_levels_and_nmissing(io, defn_enc::Int32, repn_enc::Int32, num_valu
         end
     end
 
-    #@debug("before reading repn levels bytesavailable in page: $(bytesavailable(io))")
-    # read repetition levels. skipped if all columns are at 1st level
-    max_repn_level = max_repetition_level(par.schema, cname)
-    ((length(cname) > 1) && (max_repn_level > 0)) && read_levels(io, max_repn_level, repn_enc, num_values, repn_levels, repn_offset)
-
     nmissing
 end