Skip to content

Commit

Permalink
Parquet Writer #66
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaodaigh authored and tanmaykm committed May 24, 2020
1 parent 29308e3 commit 76d0718
Show file tree
Hide file tree
Showing 7 changed files with 716 additions and 8 deletions.
17 changes: 14 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,38 @@ name = "Parquet"
uuid = "626c502c-15b0-58ad-a749-f091afb673ae"
keywords = ["parquet", "julia", "columnar-storage"]
license = "MIT"
desc = "Julia implementation of parquet columnar file format reader"
version = "0.5.2"
desc = "Julia implementation of parquet columnar file format reader and writer"
version = "0.5.3"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf"
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22"

[compat]
CategoricalArrays = "0.6,0.7,0.8"
CodecZlib = "0.5,0.6,0.7"
CodecZstd = "0.6,0.7"
DataAPI = "1"
LittleEndianBase128 = "0.3"
MemPool = "0.2"
Missings = "0.3,0.4"
Snappy = "0.3"
Tables = "1"
Thrift = "0.6,0.7"
julia = "1"

[extras]
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
test = ["Test", "Random"]
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
[![Build status](https://ci.appveyor.com/api/projects/status/gx8pvdiiery74r9l/branch/master?svg=true)](https://ci.appveyor.com/project/tanmaykm/parquet-jl-cufdj/branch/master)
[![Coverage Status](https://coveralls.io/repos/github/JuliaIO/Parquet.jl/badge.svg?branch=master)](https://coveralls.io/github/JuliaIO/Parquet.jl?branch=master)

## Reader

Load a [parquet file](https://en.wikipedia.org/wiki/Apache_Parquet). Only metadata is read initially; data is loaded in chunks on demand. (Note: [ParquetFiles.jl](https://github.com/queryverse/ParquetFiles.jl) also provides load support for Parquet files under the FileIO.jl package.)

`ParFile` represents a Parquet file at `path` open for reading. Options to map logical types can be provided via `map_logical_types`.
Expand Down Expand Up @@ -132,3 +134,31 @@ The reader will interpret logical types based on the `map_logical_types` provide
- `logical_string(v)`: Applicable for strings that are `BYTE_ARRAY` values. Without this, they are represented in a `Vector{UInt8}` type. With this they are converted to `String` types.

Variants of these methods or custom methods can also be applied by the caller.

## Writer

You can write any Tables.jl column-accessible table that contains columns of these types and their union with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64`.

However, `CategoricalArray`s are not yet supported. Furthermore, these types are not yet supported: `Int96`, `Int128`, `Date`, and `DateTime`.

### Writer Example

```julia
using Random  # provides `randstring`, which is not exported by Base

# A column table (NamedTuple of vectors) with one column per supported element
# type. Columns whose name ends in `m` also contain `missing` values.
tbl = (
    int32 = Int32.(1:1000),
    int64 = Int64.(1:1000),
    float32 = Float32.(1:1000),
    float64 = Float64.(1:1000),
    bool = rand(Bool, 1000),
    string = [randstring(8) for i in 1:1000],
    # convert explicitly: a plain `1:100` would make this an Int64 column
    int32m = rand([missing, Int32.(1:100)...], 1000),
    int64m = rand([missing, 1:100...], 1000),
    float32m = rand([missing, Float32.(1:100)...], 1000),
    float64m = rand([missing, Float64.(1:100)...], 1000),
    boolm = rand([missing, true, false], 1000),
    stringm = rand([missing, "abc", "def", "ghi"], 1000)
)

# Write the table to a temporary file with a `.parquet` extension.
file = tempname()*".parquet"
write_parquet(file, tbl)
```
8 changes: 8 additions & 0 deletions src/Parquet.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@ using CodecZstd
using MemPool
using Dates

if VERSION < v"1.3"
using Missings: nonmissingtype
end

const PARQUET_JL_VERSION = v"0.5.3"

import Base: show, open, close, values, eltype, length
import Thrift: isfilled

export is_par_file, ParFile, show, nrows, ncols, rowgroups, columns, pages, bytes, values, colname, colnames
export schema
export logical_timestamp, logical_string
export RecordCursor, BatchedColumnsCursor
export write_parquet

# package code goes here
include("PAR2/PAR2.jl")
Expand All @@ -23,5 +30,6 @@ include("schema.jl")
include("reader.jl")
include("cursor.jl")
include("show.jl")
include("writer.jl")

end # module
10 changes: 5 additions & 5 deletions src/reader.jl
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,11 @@ end
function read_levels_and_nmissing(io, defn_enc::Int32, repn_enc::Int32, num_values::Int32, par::ParFile, page::Page, defn_levels::Vector{Int32}, repn_levels::Vector{Int32}, defn_offset::Int=0, repn_offset::Int=0)
cname = colname(page.colchunk)

#@debug("before reading repn levels bytesavailable in page: $(bytesavailable(io))")
# read repetition levels. skipped if all columns are at 1st level
max_repn_level = max_repetition_level(par.schema, cname)
((length(cname) > 1) && (max_repn_level > 0)) && read_levels(io, max_repn_level, repn_enc, num_values, repn_levels, repn_offset)

#@debug("before reading defn levels bytesavailable in page: $(bytesavailable(io))")
# read definition levels. skipped if column is required
nmissing = Int32(0)
Expand All @@ -332,11 +337,6 @@ function read_levels_and_nmissing(io, defn_enc::Int32, repn_enc::Int32, num_valu
end
end

#@debug("before reading repn levels bytesavailable in page: $(bytesavailable(io))")
# read repetition levels. skipped if all columns are at 1st level
max_repn_level = max_repetition_level(par.schema, cname)
((length(cname) > 1) && (max_repn_level > 0)) && read_levels(io, max_repn_level, repn_enc, num_values, repn_levels, repn_offset)

nmissing
end

Expand Down
Loading

0 comments on commit 76d0718

Please sign in to comment.