From ba5105002f05fa288123af640a8f8d1729341064 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 13:54:02 +1000 Subject: [PATCH 01/52] before merge with remote master --- Project.toml | 14 +++++++++++++- src/Parquet.jl | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 8469635..2630be1 100644 --- a/Project.toml +++ b/Project.toml @@ -2,21 +2,33 @@ name = "Parquet" uuid = "626c502c-15b0-58ad-a749-f091afb673ae" keywords = ["parquet", "julia", "columnar-storage"] license = "MIT" -desc = "Julia implementation of parquet columnar file format reader" +desc = "Julia implementation of parquet columnar file format reader and writer" version = "0.3.2" [deps] +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" +CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22" [compat] +CategoricalArrays = "0.6,0.7,0.8" CodecZlib = "0.5,0.6,0.7" +CodecZstd = "0.7" +DataAPI = "1" +LittleEndianBase128 = "0.3" MemPool = "0.2" +ProgressMeter = "1" ProtoBuf = "0.7,0.8" Snappy = "0.3" +Tables = "1" Thrift = "0.6" julia = "1" diff --git a/src/Parquet.jl b/src/Parquet.jl index 160faf6..f86b458 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -13,6 +13,7 @@ export is_par_file, ParFile, show, nrows, ncols, rowgroups, columns, pages, byte export SchemaConverter, schema, JuliaConverter, ThriftConverter, ProtoConverter export RowCursor, ColCursor, RecCursor export AbstractBuilder, JuliaBuilder +export write_parquet # package code goes here include("PAR2/PAR2.jl") @@ -22,5 +23,6 @@ include("schema.jl") include("reader.jl") include("cursor.jl") include("show.jl") +include("writer.jl") end # module From 39323df31e4abcc713a6b150733b9a842bfaf30f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:09:37 +1000 Subject: [PATCH 02/52] adding tests --- Project.toml | 2 +- test/test_writer.jl | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 1436e3b..9564631 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2" DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" @@ -17,7 +18,6 @@ ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22" -Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" [compat] CategoricalArrays = "0.6,0.7,0.8" diff --git a/test/test_writer.jl b/test/test_writer.jl index 8e55412..23805e1 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -4,7 +4,7 @@ using Random:randstring tbl = ( int32 = Int32.(1:1000), - int64 = Int32.(1:1000), + int64 = Int64.(1:1000), float32 = Float32.(1:1000), float64 = 
Float64.(1:1000), bool = rand(Bool, 1000), @@ -14,10 +14,17 @@ tbl = ( float32m = rand([missing, Float32.(1:100)...], 1000), float64m = rand([missing, Float64.(1:100)...], 1000), boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) ) write_parquet("tmp.parquet", tbl) -ParFile("tmp.parquet") +pf = ParFile("tmp.parquet") +col_chunks = columns(pf, 1) +vals = values.(Ref(pf), Ref(col_chunks), 1:length(col_chunks)) + +vals = values(pf, col_chunks, 5) +vals = values(pf, col_chunks, 6) +vals = values(pf, col_chunks, 7) rm("tmp.parquet") From e41a113d150145f89bbfcdced4917d047f4715f2 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:36:46 +1000 Subject: [PATCH 03/52] added tests for wrtier --- src/reader.jl | 7 +++-- test/test_writer.jl | 77 ++++++++++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/src/reader.jl b/src/reader.jl index 518d054..f86f44b 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -1,4 +1,3 @@ - const PAR_MAGIC = "PAR1" const SZ_PAR_MAGIC = length(PAR_MAGIC) const SZ_FOOTER = 4 @@ -58,6 +57,10 @@ function ParFile(path::AbstractString, handle::IOStream; maxcache::Integer=10) ParFile(path, handle, meta, Schema(meta.schema), PageLRU()) end +function Base.close(par::ParFile) + close(par.handle) +end + ## # layer 1 access # can access raw (uncompressed) bytes from pages @@ -371,7 +374,7 @@ function is_par_file(io) magic = Array{UInt8}(undef, 4) read!(io, magic) (String(magic) == PAR_MAGIC) || return false - + seek(io, sz - SZ_PAR_MAGIC) magic = Array{UInt8}(undef, 4) read!(io, magic) diff --git a/test/test_writer.jl b/test/test_writer.jl index 23805e1..ffd9c95 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -1,30 +1,51 @@ using Parquet using Test -using Random:randstring - -tbl = ( - int32 = Int32.(1:1000), - int64 = Int64.(1:1000), - float32 = Float32.(1:1000), - float64 = Float64.(1:1000), - bool = rand(Bool, 1000), - string = [randstring(8) for i in 1:1000], - int32m = rand([missing, 1:100...], 1000), - int64m = rand([missing, 1:100...], 1000), - float32m = rand([missing, Float32.(1:100)...], 1000), - float64m = rand([missing, Float64.(1:100)...], 1000), - boolm = rand([missing, true, false], 1000), - stringm = rand([missing, "abc", "def", "ghi"], 1000) -) - -write_parquet("tmp.parquet", tbl) - -pf = ParFile("tmp.parquet") -col_chunks = columns(pf, 1) -vals = values.(Ref(pf), Ref(col_chunks), 1:length(col_chunks)) - -vals = values(pf, col_chunks, 5) -vals = values(pf, col_chunks, 6) -vals = values(pf, col_chunks, 7) - -rm("tmp.parquet") +using Random + +Random.seed!(1234567) + +function test_write() + tbl = ( + int32 = Int32.(1:1000), + int64 = Int64.(1:1000), + float32 = Float32.(1:1000), + float64 = Float64.(1:1000), + bool = rand(Bool, 1000), + string = [randstring(8) for i in 1:1000], + int32m = rand([missing, 1:100...], 1000), + int64m = rand([missing, 1:100...], 1000), + float32m = rand([missing, Float32.(1:100)...], 1000), + float64m = rand([missing, Float64.(1:100)...], 1000), + boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) + ) + + write_parquet("tmp_plsdel.parquet", tbl) + + pf = ParFile("tmp_plsdel.parquet") + + # the file is very smalll so only one rowgroup + col_chunks = columns(pf, 1) + + for colnum in 1:length(col_chunks) + correct_vals = tbl[colnum] + coltype = eltype(correct_vals) + vals_from_file = values(pf, col_chunks, colnum) + if Missing <: coltype + @test ismissing.(correct_vals) 
== (vals_from_file[2] .== 0) + end + + if nonmissingtype(coltype) == String + @test all(skipmissing(correct_vals) .== String.(vals_from_file[1])) + else + @test all(skipmissing(correct_vals) .== vals_from_file[1]) + end + end + + # clean up + close(pf) + + #rm("tmp_plsdel.parquet") +end + +test_write() From be7fb944655f2d433e3b2b09a3331fb05cd9f164 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:43:01 +1000 Subject: [PATCH 04/52] added readme for test write and used tempname() --- README.md | 29 ++++++++++++++++++++++++++++- test/test_writer.jl | 7 ++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7b0ecf4..90af020 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,8 @@ [![Build Status](https://travis-ci.org/JuliaIO/Parquet.jl.svg?branch=master)](https://travis-ci.org/JuliaIO/Parquet.jl) [![Build status](https://ci.appveyor.com/api/projects/status/vrqg01w2sj3mfk3d/branch/master?svg=true)](https://ci.appveyor.com/project/tanmaykm/parquet-jl/branch/master) +## Reader + Load a [parquet file](https://en.wikipedia.org/wiki/Apache_Parquet). Only metadata is read initially, data is loaded in chunks on demand. (Note: [ParquetFiles.jl](https://github.com/queryverse/ParquetFiles.jl) also provides load support for Parquet files under the FileIO.jl package.) ```julia @@ -31,7 +33,7 @@ julia> colnames(p) 8-element Array{AbstractString,1}: "c_acctbal" "c_mktsegment" - "c_nationkey" + "c_nationkey" "c_name" "c_address" "c_custkey" @@ -140,3 +142,28 @@ julia> for v in values 04/01/09, 2009-04-01T12:01:00 ``` +## Writer + +You can write any Tables.jl column accessible tables that contains columns of these types and their union with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64` + +### Writer Example + +```julia +tbl = ( + int32 = Int32.(1:1000), + int64 = Int64.(1:1000), + float32 = Float32.(1:1000), + float64 = Float64.(1:1000), + bool = rand(Bool, 1000), + string = [randstring(8) for i in 1:1000], + int32m = rand([missing, 1:100...], 1000), + int64m = rand([missing, 1:100...], 1000), + float32m = rand([missing, Float32.(1:100)...], 1000), + float64m = rand([missing, Float64.(1:100)...], 1000), + boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) +) + +file = tempname()*".parquet" +write_parquet(file, tbl) +``` diff --git a/test/test_writer.jl b/test/test_writer.jl index ffd9c95..61592af 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -20,9 +20,10 @@ function test_write() stringm = rand([missing, "abc", "def", "ghi"], 1000) ) - write_parquet("tmp_plsdel.parquet", tbl) + tmpfile = tempname()*".parquet" + write_parquet(tmpfile, tbl) - pf = ParFile("tmp_plsdel.parquet") + pf = ParFile(tmpfile) # the file is very smalll so only one rowgroup col_chunks = columns(pf, 1) @@ -45,7 +46,7 @@ function test_write() # clean up close(pf) - #rm("tmp_plsdel.parquet") + #rm(tmpfile) end test_write() From 866323edb953a5e7c561f9ef176f57ddaf860dba Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:44:35 +1000 Subject: [PATCH 05/52] fixed project.toml adding random --- Project.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 9564631..3701976 100644 --- a/Project.toml +++ b/Project.toml @@ -34,7 +34,8 @@ Thrift = "0.6" julia = "1" [extras] +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] +test = ["Test", "Random"] From 
f1e70c8415f265b53f717d0709860a8f0429eb5e Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 14:56:02 +1000 Subject: [PATCH 06/52] added version to writer --- Project.toml | 1 + src/Parquet.jl | 3 +++ src/writer.jl | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 3701976..54f72cf 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" diff --git a/src/Parquet.jl b/src/Parquet.jl index 149da23..6a5ce50 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -7,6 +7,9 @@ using CodecZlib using MemPool using Dates +using Pkg +const PARQUET_JL_VERSION = VersionNumber(Pkg.TOML.parsefile(joinpath(@__DIR__, "..", "Project.toml"))["version"]) + import Base: show, open, close, values import Thrift: isfilled diff --git a/src/writer.jl b/src/writer.jl index 9d02561..2b1ff64 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -499,7 +499,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") Thrift.set_field!(filemetadata, :version, 1) Thrift.set_field!(filemetadata, :schema, schemas) Thrift.set_field!(filemetadata, :num_rows, nrows) - Thrift.set_field!(filemetadata, :created_by, "Parquet.jl") + Thrift.set_field!(filemetadata, :created_by, "Parquet.jl $(Parquet.PARQUET_JL_VERSION)") # create row_groups # TODO do multiple row_groups From 40cbfefb9e865bdd8620c4021b626bdde4eecd00 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 13 May 2020 21:19:47 +1000 Subject: [PATCH 07/52] added missing for Julia 1.0.5 --- Project.toml | 2 ++ src/Parquet.jl | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/Project.toml b/Project.toml index 39a804b..b2a04de 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" +Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" @@ -27,6 +28,7 @@ CodecZstd = "0.6,0.7" DataAPI = "1" LittleEndianBase128 = "0.3" MemPool = "0.2" +Missings = "0.3,0.4" ProgressMeter = "1" ProtoBuf = "0.7,0.8" Snappy = "0.3" diff --git a/src/Parquet.jl b/src/Parquet.jl index ca2054a..748cc67 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -8,6 +8,10 @@ using CodecZstd using MemPool using Dates +if VERSION < v"1.3" + using Missings: nonmissingtype +end + using Pkg const PARQUET_JL_VERSION = VersionNumber(Pkg.TOML.parsefile(joinpath(@__DIR__, "..", "Project.toml"))["version"]) From be5e64cb2341aca9bd4da65ec80c11612e76c201 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Thu, 14 May 2020 23:46:35 +1000 Subject: [PATCH 08/52] removed progress meter --- Project.toml | 2 -- README.md | 4 +++- src/writer.jl | 3 +-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Project.toml b/Project.toml index b2a04de..be869a4 100644 --- a/Project.toml +++ b/Project.toml @@ -15,7 +15,6 @@ LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = 
"f9f48841-c794-520a-933b-121f7ba6ed94" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" @@ -29,7 +28,6 @@ DataAPI = "1" LittleEndianBase128 = "0.3" MemPool = "0.2" Missings = "0.3,0.4" -ProgressMeter = "1" ProtoBuf = "0.7,0.8" Snappy = "0.3" Tables = "1" diff --git a/README.md b/README.md index 90af020..6ac9e87 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,9 @@ julia> for v in values ## Writer -You can write any Tables.jl column accessible tables that contains columns of these types and their union with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64` +You can write any Tables.jl column-accessible table that contains columns of these types and their union with `Missing`: `Int32`, `Int64`, `String`, `Bool`, `Float32`, `Float64`. + +However, `CategoricalArray`s are not yet supported. Furthermore, these types are not yet supported: `Int96`, `Int128`, `Date`, and `DateTime`. ### Writer Example diff --git a/src/writer.jl b/src/writer.jl index 2b1ff64..4ad0464 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -8,7 +8,6 @@ using CodecZlib: GzipCompressor using LittleEndianBase128 using Base.Iterators: partition using CategoricalArrays: CategoricalArray, CategoricalValue -using ProgressMeter # a mapping of Julia types to _Type codes in Parquet format const COL_TYPE_CODE = Dict{DataType, Int32}( @@ -429,7 +428,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") recommended_chunks = 1 end - @showprogress for (coli, colname_sym) in enumerate(colnames) + for (coli, colname_sym) in enumerate(colnames) colvals = Tables.getcolumn(tbl, colname_sym) colname = String(colname_sym) From c97a06744157bbbb3dce86ae4757618fff96c174 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Thu, 14 May 2020 23:52:43 +1000 Subject: [PATCH 09/52] typo --- src/writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/writer.jl b/src/writer.jl index 4ad0464..b7e6b70 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -22,7 +22,7 @@ const COL_TYPE_CODE = Dict{DataType, Int32}( ) function write_thrift(fileio, thrift_obj) - """write thrift defiition to file""" + """write thrift definition to file""" p = TCompactProtocol(TFileTransport(fileio)) Thrift.write(p, thrift_obj) end From c97b56b8482c08c8011f9628b2eb787f87395fab Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Fri, 15 May 2020 00:35:46 +1000 Subject: [PATCH 10/52] fixed julia fail bug --- test/test_writer.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_writer.jl b/test/test_writer.jl index 61592af..dbee089 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -2,6 +2,10 @@ using Parquet using Test using Random +if VERSION < v"1.3" + using Missings: nonmissingtype +end + Random.seed!(1234567) function test_write() From 34ed20a3f457a07ae28bf81655a2a9a5d3534fd2 Mon Sep 17 00:00:00 2001 From: evalparse Date: Sat, 16 May 2020 12:09:18 +1000 Subject: [PATCH 11/52] Update src/writer.jl --- src/writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/writer.jl b/src/writer.jl index b7e6b70..8567e15 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -75,7 +75,7 @@ function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T # do not support dictionary with more than 127 levels # TODO relax this 127 restriction if 
length(uvals) > 127 - @warn "More than 127 levels in dictionary. This is not supported at this stage." + @warn "More than 127 levels in dictionary. Parquet.jl does not support this at this stage." return (offset = missing, uncompressed_size = 0, compressed_size = 0) end From 781ff7d1ae665ff8cedffd6c5f19a69929fbbc37 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 12:28:58 +1000 Subject: [PATCH 12/52] minor refactor --- src/writer.jl | 275 +++++++++++++++++++++++++------------------------- 1 file changed, 139 insertions(+), 136 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index b7e6b70..dfd2cb2 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -14,11 +14,11 @@ const COL_TYPE_CODE = Dict{DataType, Int32}( Bool => PAR2._Type.BOOLEAN, Int32 => PAR2._Type.INT32, Int64 => PAR2._Type.INT64, - #INT96 => 3, // deprecated, only used by legacy implementations. # not supported + #INT96 => 3, // deprecated, only used by legacy implementations. # not supported by Parquet.jl Float32 => PAR2._Type.FLOAT, Float64 => PAR2._Type.DOUBLE, String => PAR2._Type.BYTE_ARRAY, # BYTE_ARRAY - # FIXED_LEN_BYTE_ARRAY => 7, + # FIXED_LEN_BYTE_ARRAY => 7, # current there is no Julia type that we support that maps to this type ) function write_thrift(fileio, thrift_obj) @@ -120,167 +120,170 @@ function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T end # TODO set the encoding code into a dictionary -function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, encoding) where T - """write a chunk of data into a data page""" - if encoding == PAR2.Encoding.PLAIN - # generate the data page header - data_page_header = PAR2.PageHeader() - - # write repetition level data - # do nothing - # this seems to be related to nested columns - # and hence is not needed here - - # set up a buffer to write to - data_to_compress_io = IOBuffer() - - if Missing <: T - # if there is missing - # use the bit packing algorithm to write the - # definition_levels - - bytes_needed = ceil(Int, length(colvals) / 8sizeof(UInt8)) - tmp = UInt32((UInt32(bytes_needed) << 1) | 1) - bitpacking_header = LittleEndianBase128.encode(tmp) - - tmpio = IOBuffer() - not_missing_bits::BitArray = .!ismissing.(colvals) - write(tmpio, not_missing_bits) - seek(tmpio, 0) - - encoded_defn_data = read(tmpio, bytes_needed) - - encoded_defn_data_length = length(bitpacking_header) + bytes_needed - # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) - write(data_to_compress_io, bitpacking_header) - write(data_to_compress_io, encoded_defn_data) - else - # if there is no missing can just use RLE of one - # using rle - rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) - repeated_value = UInt8(1) - encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) - - # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) - write(data_to_compress_io, rle_header) - write(data_to_compress_io, repeated_value) - end +function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val{PAR2.Encoding.PLAIN}) where T + """write a chunk of data into a data page using PLAIN encoding""" - if nonmissingtype(T) == String - # write the values - for val in skipmissing(colvals) - # for string it needs to be stored as BYTE_ARRAY which needs the length - # to be the first 4 bytes UInt32 - write(data_to_compress_io, val |> sizeof |> UInt32) - # write each of the strings one after another - write(data_to_compress_io, val) - end - elseif 
nonmissingtype(T) == Bool - # write the bitacpked bits - # write a bitarray seems to write 8 bytes at a time - # so write to a tmpio first - no_missing_bit_vec = BitArray(skipmissing(colvals)) - bytes_needed = ceil(Int, length(no_missing_bit_vec) / 8sizeof(UInt8)) - tmpio = IOBuffer() - write(tmpio, no_missing_bit_vec) - seek(tmpio, 0) - packed_bits = read(tmpio, bytes_needed) - write(data_to_compress_io, packed_bits) - else - for val in skipmissing(colvals) - write(data_to_compress_io, val) - end + # generate the data page header + data_page_header = PAR2.PageHeader() + + # write repetition level data + # do nothing + # this seems to be related to nested columns + # and hence is not needed here + + # set up a buffer to write to + data_to_compress_io = IOBuffer() + + if Missing <: T + # if there is missing + # use the bit packing algorithm to write the + # definition_levels + + bytes_needed = ceil(Int, length(colvals) / 8sizeof(UInt8)) + tmp = UInt32((UInt32(bytes_needed) << 1) | 1) + bitpacking_header = LittleEndianBase128.encode(tmp) + + tmpio = IOBuffer() + not_missing_bits::BitArray = .!ismissing.(colvals) + write(tmpio, not_missing_bits) + seek(tmpio, 0) + + encoded_defn_data = read(tmpio, bytes_needed) + + encoded_defn_data_length = length(bitpacking_header) + bytes_needed + # write the definition data + write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, bitpacking_header) + write(data_to_compress_io, encoded_defn_data) + else + # if there is no missing can just use RLE of one + # using rle + rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) + repeated_value = UInt8(1) + encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) + + # write the definition data + write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, rle_header) + write(data_to_compress_io, repeated_value) + end + + if nonmissingtype(T) == String + # write the values + for val in skipmissing(colvals) + # for string it needs to be stored as BYTE_ARRAY which needs the length + # to be the first 4 bytes UInt32 + write(data_to_compress_io, val |> sizeof |> UInt32) + # write each of the strings one after another + write(data_to_compress_io, val) end + elseif nonmissingtype(T) == Bool + # write the bitacpked bits + # write a bitarray seems to write 8 bytes at a time + # so write to a tmpio first + no_missing_bit_vec = BitArray(skipmissing(colvals)) + bytes_needed = ceil(Int, length(no_missing_bit_vec) / 8sizeof(UInt8)) + tmpio = IOBuffer() + write(tmpio, no_missing_bit_vec) + seek(tmpio, 0) + packed_bits = read(tmpio, bytes_needed) + write(data_to_compress_io, packed_bits) + else + for val in skipmissing(colvals) + write(data_to_compress_io, val) + end + end - data_to_compress::Vector{UInt8} = take!(data_to_compress_io) + data_to_compress::Vector{UInt8} = take!(data_to_compress_io) - compressed_data::Vector{UInt8} = compress_using_codec(data_to_compress, codec) + compressed_data::Vector{UInt8} = compress_using_codec(data_to_compress, codec) - uncompressed_page_size = length(data_to_compress) - compressed_page_size = length(compressed_data) + uncompressed_page_size = length(data_to_compress) + compressed_page_size = length(compressed_data) - Thrift.set_field!(data_page_header, :_type, PAR2.PageType.DATA_PAGE) - Thrift.set_field!(data_page_header, :uncompressed_page_size, uncompressed_page_size) - Thrift.set_field!(data_page_header, :compressed_page_size, compressed_page_size) + Thrift.set_field!(data_page_header, 
:_type, PAR2.PageType.DATA_PAGE) + Thrift.set_field!(data_page_header, :uncompressed_page_size, uncompressed_page_size) + Thrift.set_field!(data_page_header, :compressed_page_size, compressed_page_size) - # TODO proper CRC - Thrift.set_field!(data_page_header, :crc , 0) + # TODO proper CRC + Thrift.set_field!(data_page_header, :crc , 0) - Thrift.set_field!(data_page_header, :data_page_header, PAR2.DataPageHeader()) - Thrift.set_field!(data_page_header.data_page_header, :num_values , Int32(length(colvals))) - Thrift.set_field!(data_page_header.data_page_header, :encoding , encoding) # encoding 0 is plain encoding - Thrift.set_field!(data_page_header.data_page_header, :definition_level_encoding, PAR2.Encoding.RLE) - Thrift.set_field!(data_page_header.data_page_header, :repetition_level_encoding, PAR2.Encoding.RLE) + Thrift.set_field!(data_page_header, :data_page_header, PAR2.DataPageHeader()) + Thrift.set_field!(data_page_header.data_page_header, :num_values , Int32(length(colvals))) + Thrift.set_field!(data_page_header.data_page_header, :encoding , encoding) # encoding 0 is plain encoding + Thrift.set_field!(data_page_header.data_page_header, :definition_level_encoding, PAR2.Encoding.RLE) + Thrift.set_field!(data_page_header.data_page_header, :repetition_level_encoding, PAR2.Encoding.RLE) - position_before_page_header_write = position(fileio) - write_thrift(fileio, data_page_header) - size_of_page_header_defn_repn = position(fileio) - position_before_page_header_write + position_before_page_header_write = position(fileio) + write_thrift(fileio, data_page_header) + size_of_page_header_defn_repn = position(fileio) - position_before_page_header_write - # write data - write(fileio, compressed_data) + # write data + write(fileio, compressed_data) - return ( - offset = position_before_page_header_write, - uncompressed_size = uncompressed_page_size + size_of_page_header_defn_repn, - compressed_size = compressed_page_size + size_of_page_header_defn_repn, - ) - elseif encoding == PAR2.Encoding.PLAIN_DICTIONARY - error("not implemented yet") - """Dictionary encoding""" - rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) - repeated_value = UInt8(1) + return ( + offset = position_before_page_header_write, + uncompressed_size = uncompressed_page_size + size_of_page_header_defn_repn, + compressed_size = compressed_page_size + size_of_page_header_defn_repn, + ) +end - encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) +function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val(PAR2.Encoding.PLAIN_DICTIONARY)) where T + error("PLAIN_DICTIONARY encoding not implemented yet") + """Dictionary encoding""" + rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) + repeated_value = UInt8(1) - ## write the encoded data length - write(fileio, encoded_defn_data_length) + encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) - write(fileio, rle_header) - write(fileio, repeated_value) + ## write the encoded data length + write(fileio, encoded_defn_data_length) - position(fileio) + write(fileio, rle_header) + write(fileio, repeated_value) - # write the data + position(fileio) - ## firstly, bit pack it - colvals + # write the data - # the bitwidth to use - bitwidth = ceil(UInt8, log(2, length(uvals))) - # the max bitwidth is 32 according to documentation - @assert bitwidth <= 32 - # to do that I have to figure out the Dictionary index of it - # build a JuliaDict - val_index_dict = Dict(zip(uvals, 1:length(uvals))) + ## 
firstly, bit pack it + colvals - bitwidth_mask = UInt32(2^bitwidth-1) + # the bitwidth to use + bitwidth = ceil(UInt8, log(2, length(uvals))) + # the max bitwidth is 32 according to documentation + @assert bitwidth <= 32 + # to do that I have to figure out the Dictionary index of it + # build a JuliaDict + val_index_dict = Dict(zip(uvals, 1:length(uvals))) - bytes_needed = ceil(Int, bitwidth*length(colvals) / 8) + bitwidth_mask = UInt32(2^bitwidth-1) - bit_packed_encoded_data = zeros(UInt8, bytes_needed) - upto_byte = 1 + bytes_needed = ceil(Int, bitwidth*length(colvals) / 8) - bits_written = 0 - bitsz = 8sizeof(UInt8) + bit_packed_encoded_data = zeros(UInt8, bytes_needed) + upto_byte = 1 - for val in colvals - bit_packed_val = UInt32(val_index_dict[val]) & bitwidth_mask - if bitwidth_mask <= bitsz - bits_written - bit_packed_encoded_data[upto_byte] = (bit_packed_encoded_data[upto_byte] << bitwidth_mask) | bit_packed_val - else - # this must mean - # bitwidth_mask > bitsz - bits_written - # if the remaining bits is not enough to write a packed number - 42 - end + bits_written = 0 + bitsz = 8sizeof(UInt8) + + for val in colvals + bit_packed_val = UInt32(val_index_dict[val]) & bitwidth_mask + if bitwidth_mask <= bitsz - bits_written + bit_packed_encoded_data[upto_byte] = (bit_packed_encoded_data[upto_byte] << bitwidth_mask) | bit_packed_val + else + # this must mean + # bitwidth_mask > bitsz - bits_written + # if the remaining bits is not enough to write a packed number + 42 end - else - error("Page encoding $encoding is yet not implemented.") end end +function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, encoding) where T + error("Page encoding $encoding is yet not implemented.") +end + write_col(fileio, colvals::CategoricalArray, args...; kwars...) 
= begin throw("Currently CategoricalArrays are not supported.") end From 620b0f907bcf8ae06bebe4ba8030fc179409b245 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 15:23:54 +1000 Subject: [PATCH 13/52] created a write encoded data and write definition functions --- src/writer.jl | 191 +++++++++++++++++++++++++------------------- test/test_writer.jl | 21 +++-- 2 files changed, 120 insertions(+), 92 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 730f890..39a78b0 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -9,6 +9,12 @@ using LittleEndianBase128 using Base.Iterators: partition using CategoricalArrays: CategoricalArray, CategoricalValue +if VERSION < v"1.3" + using Missings: SkipMissing +else + using Base: SkipMissing +end + # a mapping of Julia types to _Type codes in Parquet format const COL_TYPE_CODE = Dict{DataType, Int32}( Bool => PAR2._Type.BOOLEAN, @@ -67,6 +73,44 @@ function compress_using_codec(colvals::AbstractVector{String}, codec::Int)::Vect return compress_using_codec(uncompressed_bytes, codec) end +function write_defn_levels(data_to_compress_io, colvals::AbstractVector{Union{Missing, T}}) where T + """ A function to write definition levels for `Union{Missing, T}`""" + # if there is missing + # use the bit packing algorithm to write the + # definition_levels + bytes_needed = ceil(Int, length(colvals) / 8sizeof(UInt8)) + tmp = UInt32((UInt32(bytes_needed) << 1) | 1) + bitpacking_header = LittleEndianBase128.encode(tmp) + + tmpio = IOBuffer() + not_missing_bits::BitArray = .!ismissing.(colvals) + write(tmpio, not_missing_bits) + seek(tmpio, 0) + + encoded_defn_data = read(tmpio, bytes_needed) + + encoded_defn_data_length = length(bitpacking_header) + bytes_needed + # write the definition data + write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, bitpacking_header) + write(data_to_compress_io, encoded_defn_data) +end + +function write_defn_levels(data_to_compress_io, colvals::AbstractVector) + """ A function to write definition levels for NON-missing data + """ + # if there is no missing can just use RLE of one + # using rle + rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) + repeated_value = UInt8(1) + encoded_defn_data_length = sizeof(rle_header) + sizeof(repeated_value) + + # write the definition data + write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, rle_header) + write(data_to_compress_io, repeated_value) +end + function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T """ write the column dictionary page """ # note: `level`s does not return `missing` as a level @@ -119,80 +163,69 @@ function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T return (offset = before_write_page_header_pos, uncompressed_size = uncompressed_dict_size + dict_page_header_size, compressed_size = compressed_dict_size + dict_page_header_size) end -# TODO set the encoding code into a dictionary -function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val{PAR2.Encoding.PLAIN}) where T - """write a chunk of data into a data page using PLAIN encoding""" - # generate the data page header - data_page_header = PAR2.PageHeader() +write_encoded_data(data_to_compress_io, colvals::AbstractVector{Union{Missing, T}}) where T = + write_encoded_data(data_to_compress_io, skipmissing(colvals)) - # write repetition level data - # do nothing - # this seems to be related to nested columns - # and hence is not needed here +function 
write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{String}, SkipMissing{S}}) where S <: AbstractVector{Union{Missing, String}} + # write the values + for val in colvals + # for string it needs to be stored as BYTE_ARRAY which needs the length + # to be the first 4 bytes UInt32 + write(data_to_compress_io, val |> sizeof |> UInt32) + # write each of the strings one after another + write(data_to_compress_io, val) + end +end - # set up a buffer to write to - data_to_compress_io = IOBuffer() +function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{Bool}, SkipMissing{S}}) where S <: AbstractVector{Union{Missing, Bool}} + # write the bitacpked bits + # write a bitarray seems to write 8 bytes at a time + # so write to a tmpio first + no_missing_bit_vec = BitArray(colvals) + bytes_needed = ceil(Int, length(no_missing_bit_vec) / 8sizeof(UInt8)) + tmpio = IOBuffer() + write(tmpio, no_missing_bit_vec) + seek(tmpio, 0) + packed_bits = read(tmpio, bytes_needed) + write(data_to_compress_io, packed_bits) +end - if Missing <: T - # if there is missing - # use the bit packing algorithm to write the - # definition_levels +function write_encoded_data(data_to_compress_io, colvals::AbstractArray) + write(data_to_compress_io, val) +end - bytes_needed = ceil(Int, length(colvals) / 8sizeof(UInt8)) - tmp = UInt32((UInt32(bytes_needed) << 1) | 1) - bitpacking_header = LittleEndianBase128.encode(tmp) +function write_encoded_data(data_to_compress_io, colvals::SkipMissing) + for val in colvals + write(data_to_compress_io, val) + end +end - tmpio = IOBuffer() - not_missing_bits::BitArray = .!ismissing.(colvals) - write(tmpio, not_missing_bits) - seek(tmpio, 0) +# TODO set the encoding code into a dictionary +function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN}) + """ + Write a chunk of data into a data page using PLAIN encoding where the values + are written back-to-back in memory and then compressed with the codec. + For `String`s, the values are written with length (UInt32), followed by + content; it is NOT null terminated. 
+ """ - encoded_defn_data = read(tmpio, bytes_needed) + # generate the data page header + data_page_header = PAR2.PageHeader() - encoded_defn_data_length = length(bitpacking_header) + bytes_needed - # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) - write(data_to_compress_io, bitpacking_header) - write(data_to_compress_io, encoded_defn_data) - else - # if there is no missing can just use RLE of one - # using rle - rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) - repeated_value = UInt8(1) - encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) - - # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) - write(data_to_compress_io, rle_header) - write(data_to_compress_io, repeated_value) - end + # set up an IO buffer to write to + data_to_compress_io = IOBuffer() - if nonmissingtype(T) == String - # write the values - for val in skipmissing(colvals) - # for string it needs to be stored as BYTE_ARRAY which needs the length - # to be the first 4 bytes UInt32 - write(data_to_compress_io, val |> sizeof |> UInt32) - # write each of the strings one after another - write(data_to_compress_io, val) - end - elseif nonmissingtype(T) == Bool - # write the bitacpked bits - # write a bitarray seems to write 8 bytes at a time - # so write to a tmpio first - no_missing_bit_vec = BitArray(skipmissing(colvals)) - bytes_needed = ceil(Int, length(no_missing_bit_vec) / 8sizeof(UInt8)) - tmpio = IOBuffer() - write(tmpio, no_missing_bit_vec) - seek(tmpio, 0) - packed_bits = read(tmpio, bytes_needed) - write(data_to_compress_io, packed_bits) - else - for val in skipmissing(colvals) - write(data_to_compress_io, val) - end - end + # write repetition level data + ## do nothing + ## this seems to be related to nested columns + ## and hence is not needed here as we only supported unnested column write + + # write definition levels + write_defn_levels(data_to_compress_io, colvals) + + # write the encoded data + write_encoded_data(data_to_compress_io, colvals) data_to_compress::Vector{UInt8} = take!(data_to_compress_io) @@ -210,7 +243,7 @@ function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val{PAR2.En Thrift.set_field!(data_page_header, :data_page_header, PAR2.DataPageHeader()) Thrift.set_field!(data_page_header.data_page_header, :num_values , Int32(length(colvals))) - Thrift.set_field!(data_page_header.data_page_header, :encoding , encoding) # encoding 0 is plain encoding + Thrift.set_field!(data_page_header.data_page_header, :encoding , PAR2.Encoding.PLAIN) Thrift.set_field!(data_page_header.data_page_header, :definition_level_encoding, PAR2.Encoding.RLE) Thrift.set_field!(data_page_header.data_page_header, :repetition_level_encoding, PAR2.Encoding.RLE) @@ -228,9 +261,11 @@ function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val{PAR2.En ) end -function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val(PAR2.Encoding.PLAIN_DICTIONARY)) where T +function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN_DICTIONARY}) + """write Dictionary encoding data page""" error("PLAIN_DICTIONARY encoding not implemented yet") - """Dictionary encoding""" + + # TODO finish the implementation rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) repeated_value = UInt8(1) @@ -247,7 +282,6 @@ function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, ::Val(PAR2.En # write the data ## firstly, bit pack it - colvals # 
the bitwidth to use bitwidth = ceil(UInt8, log(2, length(uvals))) @@ -291,23 +325,18 @@ end function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; num_chunks = 1) where T """Write a column to a file""" # TODO turn writing dictionary on - if false - if nonmissingtype(T) == Bool - # dictionary type are not supported for - dict_info = (offset = missing, uncompressed_size = 0, compressed_size = 0) - else - dict_info = write_col_dict(fileio, colvals, codec) - end - else - # return offset of -1 means that dict_info - dict_info = (offset = missing, uncompressed_size = 0, compressed_size = 0) - end + # Currently, writing the dictionary page is not turned on for any type. + # Normally, for Boolean data, dictionary is not supported. However for other + # data types, dictionary page CAN be supported. However, since Parquet.jl + # only supports writing PLAIN encoding data, hence there is no need to write + # a dictionary page until other dictionary-based encodings are supported + dict_info = (offset = missing, uncompressed_size = 0, compressed_size = 0) num_vals_per_chunk = ceil(Int, length(colvals) / num_chunks) # TODO choose an encoding # TODO put encoding into a dictionary - chunk_info = [write_col_chunk(fileio, val_chunk, codec, encoding) for val_chunk in partition(colvals, num_vals_per_chunk)] + chunk_info = [write_col_chunk(fileio, val_chunk, codec, Val(encoding)) for val_chunk in partition(colvals, num_vals_per_chunk)] sizes = reduce(chunk_info; init = dict_info) do x, y ( diff --git a/test/test_writer.jl b/test/test_writer.jl index dbee089..bfb93b7 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -10,26 +10,27 @@ Random.seed!(1234567) function test_write() tbl = ( - int32 = Int32.(1:1000), - int64 = Int64.(1:1000), - float32 = Float32.(1:1000), - float64 = Float64.(1:1000), + int32 = rand(Int32, 1000), + int64 = rand(Int64, 1000), + float32 = rand(Float32, 1000), + float64 = rand(Float64, 1000), bool = rand(Bool, 1000), string = [randstring(8) for i in 1:1000], - int32m = rand([missing, 1:100...], 1000), - int64m = rand([missing, 1:100...], 1000), - float32m = rand([missing, Float32.(1:100)...], 1000), - float64m = rand([missing, Float64.(1:100)...], 1000), + int32m = rand([missing, rand(Int32, 10)...], 1000), + int64m = rand([missing, rand(Int64, 10)...], 1000), + float32m = rand([missing, rand(Float32, 10)...], 1000), + float64m = rand([missing, rand(Float64, 10)...], 1000), boolm = rand([missing, true, false], 1000), stringm = rand([missing, "abc", "def", "ghi"], 1000) ) tmpfile = tempname()*".parquet" + write_parquet(tmpfile, tbl) pf = ParFile(tmpfile) - # the file is very smalll so only one rowgroup + # the file is very small so only one rowgroup col_chunks = columns(pf, 1) for colnum in 1:length(col_chunks) @@ -49,8 +50,6 @@ function test_write() # clean up close(pf) - - #rm(tmpfile) end test_write() From 53382e270021ccee7ef35ab77f394ed1378cec3f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 15:26:56 +1000 Subject: [PATCH 14/52] minor bug fix --- src/writer.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/writer.jl b/src/writer.jl index 39a78b0..2104dec 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -192,7 +192,8 @@ function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{B end function write_encoded_data(data_to_compress_io, colvals::AbstractArray) - write(data_to_compress_io, val) + @assert isbitstype(eltype(colvals)) + write(data_to_compress_io, colvals) end function 
write_encoded_data(data_to_compress_io, colvals::SkipMissing) From 1bc2addd00aaa634f59894c3ff077af62d603758 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 19:03:41 +1000 Subject: [PATCH 15/52] fixed Julia 1.0.5 issue --- src/writer.jl | 6 +++--- test/test_writer.jl | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 2104dec..c5a99ea 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -9,10 +9,10 @@ using LittleEndianBase128 using Base.Iterators: partition using CategoricalArrays: CategoricalArray, CategoricalValue +using Base: SkipMissing + if VERSION < v"1.3" - using Missings: SkipMissing -else - using Base: SkipMissing + using Missings: nonmissingtype end # a mapping of Julia types to _Type codes in Parquet format diff --git a/test/test_writer.jl b/test/test_writer.jl index bfb93b7..24172f1 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -6,6 +6,8 @@ if VERSION < v"1.3" using Missings: nonmissingtype end +using Base.SkipMissing + Random.seed!(1234567) function test_write() From 5ebe14288e98fbe6daaa4fb33265c29450797433 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 19:04:24 +1000 Subject: [PATCH 16/52] minor bug fix --- test/test_writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 24172f1..1976d7c 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -6,7 +6,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -using Base.SkipMissing +using Base: SkipMissing Random.seed!(1234567) From 1f02847025f1cd7feee039d6118ca6a2adf86b0f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 19:06:00 +1000 Subject: [PATCH 17/52] removed minor --- test/test_writer.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 1976d7c..bfb93b7 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -6,8 +6,6 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -using Base: SkipMissing - Random.seed!(1234567) function test_write() From 7652a87ddb61754887f77c9e6d4cabc50b0ddca1 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 19:13:19 +1000 Subject: [PATCH 18/52] most general form of write_encoded_data --- src/writer.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/writer.jl b/src/writer.jl index c5a99ea..f4aeadb 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -168,6 +168,7 @@ write_encoded_data(data_to_compress_io, colvals::AbstractVector{Union{Missing, T write_encoded_data(data_to_compress_io, skipmissing(colvals)) function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{String}, SkipMissing{S}}) where S <: AbstractVector{Union{Missing, String}} + """ Write encoded data for String type """ # write the values for val in colvals # for string it needs to be stored as BYTE_ARRAY which needs the length @@ -179,6 +180,7 @@ function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{S end function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{Bool}, SkipMissing{S}}) where S <: AbstractVector{Union{Missing, Bool}} + """ Write encoded data for Bool type """ # write the bitacpked bits # write a bitarray seems to write 8 bytes at a time # so write to a tmpio first @@ -192,16 +194,27 @@ function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{B end function write_encoded_data(data_to_compress_io, colvals::AbstractArray) + """ Efficient write of 
encoded data for `isbits` types""" @assert isbitstype(eltype(colvals)) write(data_to_compress_io, colvals) end function write_encoded_data(data_to_compress_io, colvals::SkipMissing) + """ Write of encoded data for skipped missing types""" for val in colvals write(data_to_compress_io, val) end end +function write_encoded_data(data_to_compress_io, colvals) + """ Write of encoded data for the most general type. + The only requirement is that colvals has to be iterable + """ + for val in skipmissing(colvals) + write(data_to_compress_io, val) + end +end + # TODO set the encoding code into a dictionary function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN}) """ From a4e3ffec546acb14f916947b9da1401cd4db0927 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 21:27:16 +1000 Subject: [PATCH 19/52] refactored into internal methods --- src/writer.jl | 160 ++++++++++++++++++++++++++++---------------------- 1 file changed, 90 insertions(+), 70 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index f4aeadb..73b8313 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -216,7 +216,7 @@ function write_encoded_data(data_to_compress_io, colvals) end # TODO set the encoding code into a dictionary -function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN}) +function write_col_page(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN}) """ Write a chunk of data into a data page using PLAIN encoding where the values are written back-to-back in memory and then compressed with the codec. @@ -275,7 +275,7 @@ function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encod ) end -function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN_DICTIONARY}) +function write_col_page(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encoding.PLAIN_DICTIONARY}) """write Dictionary encoding data page""" error("PLAIN_DICTIONARY encoding not implemented yet") @@ -328,7 +328,7 @@ function write_col_chunk(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encod end end -function write_col_chunk(fileio, colvals::AbstractArray{T}, codec, encoding) where T +function write_col_page(fileio, colvals::AbstractArray{T}, codec, encoding) where T error("Page encoding $encoding is yet not implemented.") end @@ -336,7 +336,7 @@ write_col(fileio, colvals::CategoricalArray, args...; kwars...) = begin throw("Currently CategoricalArrays are not supported.") end -function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; num_chunks = 1) where T +function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; nchunks = 1) where T """Write a column to a file""" # TODO turn writing dictionary on # Currently, writing the dictionary page is not turned on for any type. 
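The `<< 1` and `| 1` arithmetic that `write_defn_levels` uses for definition levels comes from Parquet's hybrid RLE/bit-packing format: an RLE run is announced with a ULEB128 varint of `count << 1`, while a bit-packed run uses `(groups_of_8 << 1) | 1` (with bit width 1 the number of 8-value groups equals the number of bytes needed). Below is a minimal, self-contained sketch of just that header arithmetic, using a hand-rolled ULEB128 varint in place of `LittleEndianBase128.encode`; the helper names `uleb128`, `rle_header`, and `bitpacked_header` are illustrative stand-ins, not part of Parquet.jl or LittleEndianBase128.

```julia
# Sketch of the hybrid RLE / bit-packing run headers used for definition levels.
# ULEB128 varint encoder (stand-in for LittleEndianBase128.encode).
function uleb128(x::UInt32)
    bytes = UInt8[]
    while true
        byte = UInt8(x & 0x7f)
        x >>= 7
        if x == 0
            push!(bytes, byte)
            return bytes
        else
            push!(bytes, byte | 0x80)   # set continuation bit
        end
    end
end

# RLE run header: varint(count << 1). Used when a column has no missings,
# so one run of the repeated value 1 covers every row.
rle_header(count) = uleb128(UInt32(count) << 1)

# Bit-packed run header: varint((groups_of_8 << 1) | 1). With bit width 1,
# the number of groups of 8 equals the number of bytes of packed bits.
function bitpacked_header(nvalues)
    groups = cld(nvalues, 8)
    uleb128(UInt32((groups << 1) | 1))
end

rle_header(1000)        # => UInt8[0xd0, 0x0f]
bitpacked_header(1000)  # => UInt8[0xfb, 0x01]
```

For 1000 rows with no missing values, `rle_header(1000)` should reproduce the two-byte run header that the patch writes immediately before the repeated `UInt8(1)` definition level.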
@@ -346,11 +346,9 @@ function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; # a dictionary page until other dictionary-based encodings are supported dict_info = (offset = missing, uncompressed_size = 0, compressed_size = 0) - num_vals_per_chunk = ceil(Int, length(colvals) / num_chunks) + num_vals_per_chunk = ceil(Int, length(colvals) / nchunks) - # TODO choose an encoding - # TODO put encoding into a dictionary - chunk_info = [write_col_chunk(fileio, val_chunk, codec, Val(encoding)) for val_chunk in partition(colvals, num_vals_per_chunk)] + chunk_info = [write_col_page(fileio, val_chunk, codec, Val(encoding)) for val_chunk in partition(colvals, num_vals_per_chunk)] sizes = reduce(chunk_info; init = dict_info) do x, y ( @@ -359,13 +357,53 @@ function write_col(fileio, colvals::AbstractArray{T}, colname, encoding, codec; ) end + # write the column metadata + # can probably write the metadata right after the data chunks + col_meta = PAR2.ColumnMetaData() + + Thrift.set_field!(col_meta, :_type, COL_TYPE_CODE[eltype(colvals) |> nonmissingtype]) + # these are all the fields + # TODO collect all the encodings used + if eltype(colvals) == Bool + Thrift.set_field!(col_meta, :encodings, Int32[0, 3]) + else + Thrift.set_field!(col_meta, :encodings, Int32[2, 0, 3]) + end + Thrift.set_field!(col_meta, :path_in_schema, [colname]) + Thrift.set_field!(col_meta, :codec, codec) + Thrift.set_field!(col_meta, :num_values, length(colvals)) + + Thrift.set_field!(col_meta, :total_uncompressed_size, sizes.uncompressed_size) + Thrift.set_field!(col_meta, :total_compressed_size, sizes.compressed_size) + + Thrift.set_field!(col_meta, :data_page_offset, chunk_info[1].offset) + if !ismissing(dict_info.offset) + Thrift.set_field!(col_meta, :dictionary_page_offset, dict_info.offset) + end + + # write the column meta data right after the data + # keep track of the position so it can put into the column chunk + # metadata + col_meta_offset = position(fileio) + write_thrift(fileio, col_meta) + + # Prep metadata for the filemetadata + ## column chunk metadata + col_chunk_meta = PAR2.ColumnChunk() + + Thrift.set_field!(col_chunk_meta, :file_offset, col_meta_offset) + Thrift.set_field!(col_chunk_meta, :meta_data, col_meta) + Thrift.clear(col_chunk_meta, :offset_index_offset) + Thrift.clear(col_chunk_meta, :offset_index_length) + Thrift.clear(col_chunk_meta, :column_index_offset) + Thrift.clear(col_chunk_meta, :column_index_length) + return ( - dictionary_page_offset = dict_info.offset, data_page_offset = chunk_info[1].offset, - uncompressed_size = sizes.uncompressed_size, - compressed_size = sizes.compressed_size, + dictionary_page_offset = dict_info.offset, + col_chunk_meta = col_chunk_meta, + col_meta_offset = col_meta_offset ) - end function create_schema_parent_node(ncols) @@ -446,20 +484,6 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") # convert a string or symbol compression codec into the numeric code codec = getproperty(PAR2.CompressionCodec, Symbol(uppercase(string(compression_codec)))) - fileio = open(path, "w") - write(fileio, "PAR1") - - colnames = Tables.columnnames(tbl) - ncols = length(colnames) - nrows = length(Tables.rows(tbl)) - - # the + 1 comes from the fact that schema is tree and there is an extra - # parent node - schemas = Vector{PAR2.SchemaElement}(undef, ncols + 1) - schemas[1] = create_schema_parent_node(ncols) - col_chunk_metas = Vector{PAR2.ColumnChunk}(undef, ncols) - row_group_file_offset = missing - # figure out the right number of chunks # TODO test 
that it works for all supported table table_size_bytes = Base.summarysize(tbl) @@ -467,6 +491,9 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") approx_raw_to_parquet_compression_ratio = 6 approx_post_compression_size = (table_size_bytes / 2^30) / approx_raw_to_parquet_compression_ratio + colnames = String.(Tables.columnnames(tbl)) + nrows = length(Tables.rows(tbl)) + # if size is larger than 64mb and has more than 6 rows if (approx_post_compression_size > 0.064) & (nrows > 6) recommended_chunks = ceil(Int, approx_post_compression_size / 6) * 6 @@ -474,12 +501,46 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") recommended_chunks = 1 end + _write_parquet( + tbl, + path, + recommended_chunks; + encoding = Dict(String(col)=>encoding for col in colnames), + codec = Dict(String(col)=>codec for col in colnames) + ) +end + +function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec::Dict{String, Int32}) + """Internal method for writing parquet + + tbl - Expected to be a Tables.jl compatible table + path - The output parquet file path + + """ + fileio = open(path, "w") + write(fileio, "PAR1") + + colnames = Tables.columnnames(tbl) + ncols = length(colnames) + nrows = length(Tables.rows(tbl)) + + # the + 1 comes from the fact that schema is tree and there is an extra + # parent node + schemas = Vector{PAR2.SchemaElement}(undef, ncols + 1) + schemas[1] = create_schema_parent_node(ncols) + col_chunk_metas = Vector{PAR2.ColumnChunk}(undef, ncols) + row_group_file_offset = missing + + # write the columns one by one + # TODO parallelize this for (coli, colname_sym) in enumerate(colnames) colvals = Tables.getcolumn(tbl, colname_sym) colname = String(colname_sym) - # write the data - col_info = write_col(fileio, colvals, colname, encoding, codec; num_chunks = recommended_chunks) + col_encoding = encoding[colname] + col_codec = codec[colname] + # write the data including metadata + col_info = write_col(fileio, colvals, colname, col_encoding, col_codec; nchunks = nchunks) # the `row_group_file_offset` keeps track where the data # starts, so keep it at the dictonary of the first data @@ -491,48 +552,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") end end - # write the column metadata - # can probably write the metadata right after the data chunks - col_meta = PAR2.ColumnMetaData() - - Thrift.set_field!(col_meta, :_type, COL_TYPE_CODE[eltype(colvals) |> nonmissingtype]) - # these are all the fields - # TODO collect all the encodings used - if eltype(colvals) == Bool - Thrift.set_field!(col_meta, :encodings, Int32[0, 3]) - else - Thrift.set_field!(col_meta, :encodings, Int32[2, 0, 3]) - end - Thrift.set_field!(col_meta, :path_in_schema, [colname]) - Thrift.set_field!(col_meta, :codec, codec) - Thrift.set_field!(col_meta, :num_values, length(colvals)) - - Thrift.set_field!(col_meta, :total_uncompressed_size, col_info.uncompressed_size) - Thrift.set_field!(col_meta, :total_compressed_size, col_info.compressed_size) - - Thrift.set_field!(col_meta, :data_page_offset, col_info.data_page_offset) - if !ismissing(col_info.dictionary_page_offset) - Thrift.set_field!(col_meta, :dictionary_page_offset, col_info.dictionary_page_offset) - end - - # write the column meta data right after the data - # keep track of the position so it can put into the column chunk - # metadata - col_meta_offset = position(fileio) - write_thrift(fileio, col_meta) - - # Prep metadata for the filemetadata - ## column chunk metadata - col_chunk_meta = 
PAR2.ColumnChunk() - - Thrift.set_field!(col_chunk_meta, :file_offset, col_meta_offset) - Thrift.set_field!(col_chunk_meta, :meta_data, col_meta) - Thrift.clear(col_chunk_meta, :offset_index_offset) - Thrift.clear(col_chunk_meta, :offset_index_length) - Thrift.clear(col_chunk_meta, :column_index_offset) - Thrift.clear(col_chunk_meta, :column_index_length) - - col_chunk_metas[coli] = col_chunk_meta + col_chunk_metas[coli] = col_info.col_chunk_meta # add the schema schemas[coli + 1] = create_col_schema(eltype(colvals) |> nonmissingtype, colname) From 06fb6996bbdab20ff8b11f6f3546d0e231447b18 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 21:29:09 +1000 Subject: [PATCH 20/52] minor for clarity --- src/writer.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 73b8313..a9b48c1 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -491,16 +491,15 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") approx_raw_to_parquet_compression_ratio = 6 approx_post_compression_size = (table_size_bytes / 2^30) / approx_raw_to_parquet_compression_ratio - colnames = String.(Tables.columnnames(tbl)) - nrows = length(Tables.rows(tbl)) - # if size is larger than 64mb and has more than 6 rows + nrows = length(Tables.rows(tbl)) if (approx_post_compression_size > 0.064) & (nrows > 6) recommended_chunks = ceil(Int, approx_post_compression_size / 6) * 6 else recommended_chunks = 1 end + colnames = String.(Tables.columnnames(tbl)) _write_parquet( tbl, path, From c0bd4d0f2cdc602e8aaf83af8ef54174387a9eac Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 16 May 2020 21:32:24 +1000 Subject: [PATCH 21/52] minor update --- src/writer.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index a9b48c1..6aac875 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -504,8 +504,8 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") tbl, path, recommended_chunks; - encoding = Dict(String(col)=>encoding for col in colnames), - codec = Dict(String(col)=>codec for col in colnames) + encoding = Dict(col => encoding for col in colnames), + codec = Dict(col => codec for col in colnames) ) end @@ -514,6 +514,9 @@ function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec tbl - Expected to be a Tables.jl compatible table path - The output parquet file path + nchunks - The number of chunks/pages to write the columns + encoding - A dictionary mapping from column names to encoding + codec - A dictionary mapping from column names to compressoin codec """ fileio = open(path, "w") From 774bb4c3384d26f6123167e2c3a735a1a5f0ce24 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 11:38:30 +1000 Subject: [PATCH 22/52] fixed all comments --- src/reader.jl | 4 -- src/writer.jl | 152 +++++++++++++++++++++++++------------------------- 2 files changed, 77 insertions(+), 79 deletions(-) diff --git a/src/reader.jl b/src/reader.jl index 9c94622..9150e05 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -76,10 +76,6 @@ function close(par::ParFile) close(par.handle) end -function Base.close(par::ParFile) - close(par.handle) -end - ## # layer 1 access # can access raw (uncompressed) bytes from pages diff --git a/src/writer.jl b/src/writer.jl index 6aac875..3d0ce5b 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -29,8 +29,14 @@ const COL_TYPE_CODE = Dict{DataType, Int32}( function write_thrift(fileio, thrift_obj) """write thrift definition to file""" + pos_before_write = 
position(fileio) p = TCompactProtocol(TFileTransport(fileio)) Thrift.write(p, thrift_obj) + pos_after_write = position(fileio) + + size_of_written = pos_after_write - pos_before_write + + size_of_written end function compress_using_codec(colvals::AbstractArray, codec::Integer)::Vector{UInt8} @@ -111,57 +117,56 @@ function write_defn_levels(data_to_compress_io, colvals::AbstractVector) write(data_to_compress_io, repeated_value) end -function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T - """ write the column dictionary page """ - # note: `level`s does not return `missing` as a level - uvals = DataAPI.levels(colvals) - - # do not support dictionary with more than 127 levels - # TODO relax this 127 restriction - if length(uvals) > 127 - @warn "More than 127 levels in dictionary. Parquet.jl does not support this at this stage." - return (offset = missing, uncompressed_size = 0, compressed_size = 0) - end - - if nonmissingtype(T) == String - # the raw bytes of made of on UInt32 to indicate string length - # and the content of the string - # so the formula for dict size is as below - uncompressed_dict_size = sizeof(UInt32)*length(uvals) + sum(sizeof, uvals) - else - uncompressed_dict_size = length(uvals)*sizeof(eltype(uvals)) - end - - compressed_uvals::Vector{UInt8} = compress_using_codec(uvals, codec) - compressed_dict_size = length(compressed_uvals) - - # TODO do the CRC properly - crc = 0 - - # construct dictionary metadata - dict_page_header = PAR2.PageHeader() - - Thrift.set_field!(dict_page_header, :_type, PAR2.PageType.DICTIONARY_PAGE) - Thrift.set_field!(dict_page_header, :uncompressed_page_size , uncompressed_dict_size) - Thrift.set_field!(dict_page_header, :compressed_page_size , compressed_dict_size) - Thrift.set_field!(dict_page_header, :crc , crc) - - Thrift.set_field!(dict_page_header, :dictionary_page_header, PAR2.DictionaryPageHeader()) - Thrift.set_field!(dict_page_header.dictionary_page_header, :num_values , Int32(length(uvals))) - Thrift.set_field!(dict_page_header.dictionary_page_header, :encoding , PAR2.Encoding.PLAIN_DICTIONARY) - Thrift.set_field!(dict_page_header.dictionary_page_header, :is_sorted , false) - - before_write_page_header_pos = position(fileio) - - write_thrift(fileio, dict_page_header) - - dict_page_header_size = position(fileio) - before_write_page_header_pos - - # write the dictionary data - write(fileio, compressed_uvals) - - return (offset = before_write_page_header_pos, uncompressed_size = uncompressed_dict_size + dict_page_header_size, compressed_size = compressed_dict_size + dict_page_header_size) -end +# TODO turn this on when writing dictionary is necessary +# function write_col_dict(fileio, colvals::AbstractArray{T}, codec) where T +# """ write the column dictionary page """ +# # note: `level`s does not return `missing` as a level +# uvals = DataAPI.levels(colvals) +# +# # do not support dictionary with more than 127 levels +# # TODO relax this 127 restriction +# if length(uvals) > 127 +# @warn "More than 127 levels in dictionary. Parquet.jl does not support this at this stage." 
+# return (offset = missing, uncompressed_size = 0, compressed_size = 0) +# end +# +# if nonmissingtype(T) == String +# # the raw bytes of made of on UInt32 to indicate string length +# # and the content of the string +# # so the formula for dict size is as below +# uncompressed_dict_size = sizeof(UInt32)*length(uvals) + sum(sizeof, uvals) +# else +# uncompressed_dict_size = length(uvals)*sizeof(eltype(uvals)) +# end +# +# compressed_uvals::Vector{UInt8} = compress_using_codec(uvals, codec) +# compressed_dict_size = length(compressed_uvals) +# +# # TODO do the CRC properly +# crc = 0 +# +# # construct dictionary metadata +# dict_page_header = PAR2.PageHeader() +# +# Thrift.set_field!(dict_page_header, :_type, PAR2.PageType.DICTIONARY_PAGE) +# Thrift.set_field!(dict_page_header, :uncompressed_page_size , uncompressed_dict_size) +# Thrift.set_field!(dict_page_header, :compressed_page_size , compressed_dict_size) +# Thrift.set_field!(dict_page_header, :crc , crc) +# +# Thrift.set_field!(dict_page_header, :dictionary_page_header, PAR2.DictionaryPageHeader()) +# Thrift.set_field!(dict_page_header.dictionary_page_header, :num_values , Int32(length(uvals))) +# Thrift.set_field!(dict_page_header.dictionary_page_header, :encoding , PAR2.Encoding.PLAIN_DICTIONARY) +# Thrift.set_field!(dict_page_header.dictionary_page_header, :is_sorted , false) +# +# before_write_page_header_pos = position(fileio) +# +# dict_page_header_size = write_thrift(fileio, dict_page_header) +# +# # write the dictionary data +# write(fileio, compressed_uvals) +# +# return (offset = before_write_page_header_pos, uncompressed_size = uncompressed_dict_size + dict_page_header_size, compressed_size = compressed_dict_size + dict_page_header_size) +# end write_encoded_data(data_to_compress_io, colvals::AbstractVector{Union{Missing, T}}) where T = @@ -262,8 +267,8 @@ function write_col_page(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encodi Thrift.set_field!(data_page_header.data_page_header, :repetition_level_encoding, PAR2.Encoding.RLE) position_before_page_header_write = position(fileio) - write_thrift(fileio, data_page_header) - size_of_page_header_defn_repn = position(fileio) - position_before_page_header_write + + size_of_page_header_defn_repn = write_thrift(fileio, data_page_header) # write data write(fileio, compressed_data) @@ -501,7 +506,8 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") colnames = String.(Tables.columnnames(tbl)) _write_parquet( - tbl, + Tables.columns(tbl), + Tables.columnnames(tbl), path, recommended_chunks; encoding = Dict(col => encoding for col in colnames), @@ -509,24 +515,23 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") ) end -function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec::Dict{String, Int32}) +function _write_parquet(itr_vectors, colnames, path, nchunks; encoding::Dict{String, Int32}, codec::Dict{String, Int32}) """Internal method for writing parquet - tbl - Expected to be a Tables.jl compatible table - path - The output parquet file path - nchunks - The number of chunks/pages to write the columns - encoding - A dictionary mapping from column names to encoding - codec - A dictionary mapping from column names to compressoin codec - + itr_vectors - An iterable of `AbstractVector`s containing the values to be + written + colnames - Column names for each of the vectors + path - The output parquet file path + nchunks - The number of chunks/pages to write for each column + encoding - A dictionary mapping from column names to 
encoding + codec - A dictionary mapping from column names to compression codec """ fileio = open(path, "w") write(fileio, "PAR1") - colnames = Tables.columnnames(tbl) - ncols = length(colnames) - nrows = length(Tables.rows(tbl)) + ncols = length(itr_vectors) - # the + 1 comes from the fact that schema is tree and there is an extra + # the + 1 comes from the fact that schema is a tree and there is an extra # parent node schemas = Vector{PAR2.SchemaElement}(undef, ncols + 1) schemas[1] = create_schema_parent_node(ncols) @@ -535,8 +540,8 @@ function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec # write the columns one by one # TODO parallelize this - for (coli, colname_sym) in enumerate(colnames) - colvals = Tables.getcolumn(tbl, colname_sym) + nrows = -1 # initialize it + for (coli, (colname_sym, colvals)) in enumerate(zip(colnames, itr_vectors)) colname = String(colname_sym) col_encoding = encoding[colname] @@ -544,9 +549,10 @@ function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec # write the data including metadata col_info = write_col(fileio, colvals, colname, col_encoding, col_codec; nchunks = nchunks) - # the `row_group_file_offset` keeps track where the data - # starts, so keep it at the dictonary of the first data + # the `row_group_file_offset` keeps track of where the data starts, so + # keep it at the dictonary of the first data if coli == 1 + nrows = length(colvals) if ismissing(col_info.dictionary_page_offset) row_group_file_offset = col_info.data_page_offset else @@ -584,13 +590,9 @@ function _write_parquet(tbl, path, nchunks; encoding::Dict{String, Int32}, codec Thrift.set_field!(filemetadata, :row_groups, [row_group]) - position_before_filemetadata_write = position(fileio) - - write_thrift(fileio, filemetadata) - - filemetadata_size = position(fileio) - position_before_filemetadata_write + filemetadata_size = write_thrift(fileio, filemetadata) - write(fileio, Int32(filemetadata_size)) + write(fileio, UInt32(filemetadata_size)) write(fileio, "PAR1") close(fileio) end From 36fdd327b82de608a8142c38a98ffcbaae72e404 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 11:47:27 +1000 Subject: [PATCH 23/52] Update writer.jl --- src/writer.jl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index 3d0ce5b..3bf213e 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -515,7 +515,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") ) end -function _write_parquet(itr_vectors, colnames, path, nchunks; encoding::Dict{String, Int32}, codec::Dict{String, Int32}) +function _write_parquet(itr_vectors, colnames, path, nchunks; ncols = length(itr_vectors), encoding::Dict{String, Int32}, codec::Dict{String, Int32}) """Internal method for writing parquet itr_vectors - An iterable of `AbstractVector`s containing the values to be @@ -523,14 +523,15 @@ function _write_parquet(itr_vectors, colnames, path, nchunks; encoding::Dict{Str colnames - Column names for each of the vectors path - The output parquet file path nchunks - The number of chunks/pages to write for each column + ncols - The number of columns. This is provided as an argument for + the case where the `length(itr_vectors)` is not defined, + e.g. lazy loading of remote resources. 
encoding - A dictionary mapping from column names to encoding codec - A dictionary mapping from column names to compression codec """ fileio = open(path, "w") write(fileio, "PAR1") - ncols = length(itr_vectors) - # the + 1 comes from the fact that schema is a tree and there is an extra # parent node schemas = Vector{PAR2.SchemaElement}(undef, ncols + 1) From ba78cb8e4f0b974a7a0749abcabc43922a51be60 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 12:09:50 +1000 Subject: [PATCH 24/52] made version number of package a constant instead of relying on the directory of the package, because relying on directory makes Parquet.jl static compilation unfriendly. --- Project.toml | 1 - src/Parquet.jl | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index be869a4..4738555 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,6 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" -Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProtoBuf = "3349acd9-ac6a-5e09-bcdb-63829b23a429" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" diff --git a/src/Parquet.jl b/src/Parquet.jl index b7d71df..ed45a0f 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -12,8 +12,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -using Pkg -const PARQUET_JL_VERSION = VersionNumber(Pkg.TOML.parsefile(joinpath(@__DIR__, "..", "Project.toml"))["version"]) +const PARQUET_JL_VERSION = v"0.4.0" import Base: show, open, close, values, eltype, length import Thrift: isfilled From 656d503029c73d8e38c664358752f59ba703962d Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 13:26:33 +1000 Subject: [PATCH 25/52] fixed bug of not writing DataFrame properly --- src/writer.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/writer.jl b/src/writer.jl index 3bf213e..afaadac 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -506,7 +506,7 @@ function write_parquet(path, tbl; compression_codec = "SNAPPY") colnames = String.(Tables.columnnames(tbl)) _write_parquet( - Tables.columns(tbl), + Tables.Columns(tbl), Tables.columnnames(tbl), path, recommended_chunks; From 7eda104bd1609ea311d2967844fcca749e24445f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 13:30:51 +1000 Subject: [PATCH 26/52] updated parquet --- src/Parquet.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parquet.jl b/src/Parquet.jl index 05e9001..fa2e60e 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -11,7 +11,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -const PARQUET_JL_VERSION = v"0.4.0" +const PARQUET_JL_VERSION = v"0.5.0" import Base: show, open, close, values, eltype, length import Thrift: isfilled From 04cca7815dd0340e13d99c4a5be147b796c62276 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 14:49:08 +1000 Subject: [PATCH 27/52] removed protobuf --- Project.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Project.toml b/Project.toml index c96986d..601e254 100644 --- a/Project.toml +++ b/Project.toml @@ -26,7 +26,6 @@ DataAPI = "1" LittleEndianBase128 = "0.3" MemPool = "0.2" Missings = "0.3,0.4" -ProtoBuf = "0.7,0.8" Snappy = "0.3" Tables = "1" Thrift = "0.6,0.7" From 2a9ff1d9e81da894576db93bec10b49dbfb9d02e Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 18 May 2020 15:48:31 +1000 Subject: 
[PATCH 28/52] upped version to 0.5.1 --- Project.toml | 2 +- src/Parquet.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index 601e254..2c22e9d 100644 --- a/Project.toml +++ b/Project.toml @@ -3,7 +3,7 @@ uuid = "626c502c-15b0-58ad-a749-f091afb673ae" keywords = ["parquet", "julia", "columnar-storage"] license = "MIT" desc = "Julia implementation of parquet columnar file format reader and writer" -version = "0.5.0" +version = "0.5.1" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" diff --git a/src/Parquet.jl b/src/Parquet.jl index fa2e60e..313f8dd 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -11,7 +11,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -const PARQUET_JL_VERSION = v"0.5.0" +const PARQUET_JL_VERSION = v"0.5.1" import Base: show, open, close, values, eltype, length import Thrift: isfilled From a6f2a8a3b4fed27e53ff695f46514d2bdc2c15ec Mon Sep 17 00:00:00 2001 From: tan Date: Mon, 18 May 2020 18:57:23 +0530 Subject: [PATCH 29/52] performace improvements, few fixes - fix condition for missing column values when row can not be located in a column chunk - few performance improvements --- src/codec.jl | 61 ++++++++++++------------- src/cursor.jl | 110 ++++++++++++++++++++++++--------------------- src/reader.jl | 22 ++++----- test/test_codec.jl | 8 +++- 4 files changed, 104 insertions(+), 97 deletions(-) diff --git a/src/codec.jl b/src/codec.jl index 7278ad6..80fd244 100644 --- a/src/codec.jl +++ b/src/codec.jl @@ -4,18 +4,17 @@ const MSB = 0x80 const MASK7 = 0x7f const MASK8 = 0xff const MASK3 = 0x07 -function MASKN(nbits) - T = byt2uitype_small(bit2bytewidth(nbits)) +Base.@pure function MASKN(nbits::UInt8, ::Type{T}=byt2uitype_small(bit2bytewidth(nbits))) where {T} O = convert(T, 0x1) (O << nbits) - O end -bitwidth(i) = ceil(Int, log(2, i+1)) -bytewidth(i) = bit2bytewidth(bitwidth(i)) -bit2bytewidth(i) = ceil(Int, i/8) -byt2itype(i) = (i <= 4) ? Int32 : (i <= 8) ? Int64 : Int128 -byt2uitype(i) = (i <= 4) ? UInt32 : (i <= 8) ? UInt64 : UInt128 -byt2uitype_small(i) = (i <= 1) ? UInt8 : (i <= 2) ? UInt16 : (i <= 4) ? UInt32 : (i <= 8) ? UInt64 : UInt128 +Base.@pure bitwidth(i::Int) = ceil(Int, log(2, i+1)) +#bytewidth(i) = bit2bytewidth(bitwidth(i)) +Base.@pure bit2bytewidth(i::UInt8) = ceil(Int, i/8) +Base.@pure byt2itype(i::Int) = (i <= 4) ? Int32 : (i <= 8) ? Int64 : Int128 +Base.@pure byt2uitype(i::Int) = (i <= 4) ? UInt32 : (i <= 8) ? UInt64 : UInt128 +Base.@pure byt2uitype_small(i::Int) = (i <= 1) ? UInt8 : (i <= 2) ? UInt16 : (i <= 4) ? UInt32 : (i <= 8) ? 
UInt64 : UInt128 read_fixed(io::IO, typ::Type{UInt32}) = _read_fixed(io, convert(UInt32,0), 4) read_fixed(io::IO, typ::Type{UInt64}) = _read_fixed(io, convert(UInt64,0), 8) @@ -72,19 +71,19 @@ function read_plain(io::IO, typ::Int32, jtype::Type{T}=PLAIN_JTYPES[typ+1]) wher end # read plain values or dictionary (PLAIN_DICTIONARY = 2) -function read_plain_values(io::IO, count::Integer, typ::Int32) - @debug("reading plain values", type=typ, count=count) +function read_plain_values(io::IO, count::Int32, typ::Int32) + #@debug("reading plain values", type=typ, count=count) if typ == _Type.BOOLEAN arr = read_bitpacked_booleans(io, count) else arr = [read_plain(io, typ) for i in 1:count] end - @debug("read $(length(arr)) plain values") + #@debug("read $(length(arr)) plain values") arr end -function read_bitpacked_booleans(io::IO, count::Integer) #, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count); read_len::Bool=true) where {T <: Integer} - @debug("reading bitpacked booleans", count) +function read_bitpacked_booleans(io::IO, count::Int32) + #@debug("reading bitpacked booleans", count) arr = falses(count) arrpos = 1 bits = UInt8(0) @@ -92,7 +91,7 @@ function read_bitpacked_booleans(io::IO, count::Integer) #, bits::Integer, byt:: while arrpos <= count if bitpos > 8 bits = read(io, UInt8) - @debug("bits", bits, bitstring(bits)) + #@debug("bits", bits, bitstring(bits)) bitpos = 1 end arr[arrpos] = Bool(bits & 0x1) @@ -104,16 +103,16 @@ function read_bitpacked_booleans(io::IO, count::Integer) #, bits::Integer, byt:: end # read rle dictionary (RLE_DICTIONARY = 8, or PLAIN_DICTIONARY = 2 in a data page) -function read_rle_dict(io::IO, count::Integer) +function read_rle_dict(io::IO, count::Int32) bits = read(io, UInt8) - @debug("reading rle dictionary bits:$bits") + #@debug("reading rle dictionary bits:$bits") arr = read_hybrid(io, count, bits; read_len=false) - @debug("read $(length(arr)) dictionary values") + #@debug("read $(length(arr)) dictionary values") arr end # read RLE or bit backed format (RLE = 3) -function read_hybrid(io::IO, count::Integer, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count); read_len::Bool=true) where {T <: Integer} +function read_hybrid(io::IO, count::Int32, bits::UInt8, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count); read_len::Bool=true) where {T <: Integer} len = read_len ? read_fixed(io, Int32) : Int32(0) @debug("reading hybrid data", len, count, bits) arrpos = 1 @@ -137,13 +136,13 @@ function read_hybrid(io::IO, count::Integer, bits::Integer, byt::Int=bit2bytewid arr end -function read_rle_run(io::IO, count::Integer, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(count)) where {T <: Integer} - @debug("read_rle_run. 
count:$count, typ:$T, nbits:$bits, nbytes:$byt") +function read_rle_run(io::IO, count::Int, bits::UInt8, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(count)) where {T <: Integer} + @debug("read_rle_run", count, T, bits, byt) arr[1:count] .= reinterpret(T, _read_fixed(io, zero(byt2uitype(byt)), byt)) arr end -function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, grp_count*8)) where {T <: Integer} +function read_bitpacked_run(io::IO, grp_count::Int, bits::UInt8, byt::Int, typ::Type{T}, arr::Vector{T}, mask::V=MASKN(bits)) where {T <: Integer, V <: Integer} count = min(grp_count * 8, length(arr)) # multiple of 8 values at a time are bit packed together nbytes = bits * grp_count # same as: round(Int, (bits * grp_count * 8) / 8) @@ -151,11 +150,9 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= data = Array{UInt8}(undef, min(nbytes, bytesavailable(io))) read!(io, data) - mask = MASKN(bits) - V = typeof(mask) bitbuff = zero(V) - nbitsbuff = 0 - shift = 0 + nbitsbuff = UInt8(0) + shift = UInt8(0) arridx = 1 dataidx = 1 @@ -165,9 +162,9 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= # we have leftover bits, which must be appended if nbitsbuff < bits # but only append if we need to read more in this cycle - arr[arridx] = bitbuff & MASKN(nbitsbuff) + @inbounds arr[arridx] = bitbuff & MASKN(nbitsbuff, V) shift = nbitsbuff - nbitsbuff = 0 + nbitsbuff = UInt8(0) bitbuff = zero(V) end end @@ -177,7 +174,7 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= # shift 8 bits and read directly into bitbuff bitbuff |= (V(data[dataidx]) << nbitsbuff) dataidx += 1 - nbitsbuff += 8 + nbitsbuff += UInt8(8) end # set values @@ -188,7 +185,7 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= arr[arridx] |= convert(T, (bitbuff << shift) & mask) bitbuff >>= remshift nbitsbuff -= remshift - shift = 0 + shift = UInt8(0) else #@debug("setting all from bitbuff nbitsbuff:$nbitsbuff") arr[arridx] = convert(T, bitbuff & mask) @@ -202,17 +199,15 @@ function read_bitpacked_run(io::IO, grp_count::Integer, bits::Integer, byt::Int= end # read bit packed in deprecated format (BIT_PACKED = 4) -function read_bitpacked_run_old(io::IO, count::Integer, bits::Integer, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count)) where {T <: Integer} +function read_bitpacked_run_old(io::IO, count::Int, bits::UInt8, byt::Int=bit2bytewidth(bits), typ::Type{T}=byt2itype(byt), arr::Vector{T}=Array{T}(undef, count), mask::V=MASKN(bits)) where {T <: Integer, V <: Integer} # multiple of 8 values at a time are bit packed together nbytes = round(Int, (bits * count) / 8) - @debug("read_bitpacked_run. count:$count, nbytes:$nbytes, nbits:$bits") + #@debug("read_bitpacked_run. 
count:$count, nbytes:$nbytes, nbits:$bits") data = Array{UInt8}(undef, nbytes) read!(io, data) # the mask is of the smallest bounding type for bits # T is one of the types that map on to the appropriate Julia type in Parquet (which may be larger than the mask type) - mask = MASKN(bits) - V = typeof(mask) bitbuff = zero(V) nbitsbuff = 0 diff --git a/src/cursor.jl b/src/cursor.jl index bfc6d43..1c7bf72 100644 --- a/src/cursor.jl +++ b/src/cursor.jl @@ -127,7 +127,7 @@ function setrow(cursor::ColCursor{T}, row::Int64) where {T} end # find the column chunk with the row - if cursor.ccrange===nothing || !(row in cursor.ccrange) + if (cursor.ccrange === nothing) || !(row in cursor.ccrange) offset = rowgroup_offset(cursor.row) # the offset of row from beginning of current rowgroup colchunks = cursor.colchunks @@ -137,7 +137,7 @@ function setrow(cursor::ColCursor{T}, row::Int64) where {T} if isempty(repn_levels) nrowscc = length(vals) # number of values is number of rows else - nrowscc = length(repn_levels) - length(find(repn_levels)) # number of values where repeation level is 0 + nrowscc = length(repn_levels) - length(find(repn_levels)) # number of values where repetition level is 0 end ccrange = startrow:(startrow + nrowscc) @@ -156,49 +156,55 @@ function setrow(cursor::ColCursor{T}, row::Int64) where {T} end end - # find the starting positions for values and levels - ccrange = cursor.ccrange - defn_levels = cursor.defn_levels - repn_levels = cursor.repn_levels - levelpos = valpos = Int64(0) - - # compute the level and value pos for row - if isempty(repn_levels) - # no repetitions, so each entry corresponds to one full row - levelpos = row - first(ccrange) + 1 - levelrange = levelpos:levelpos + if cursor.ccrange === nothing + # we did not find the row in this column + cursor.valpos = cursor.levelpos = 0 + cursor.levelrange = 0:-1 #cursor.valrange = 0:-1 else - # multiple entries may constitute one row - idx = first(ccrange) - levelpos = findfirst(repn_levels, 0) # NOTE: can start from cursor.levelpos to optimize, but that will prevent using setrow to go backwards - while idx < row - levelpos = findnext(repn_levels, 0, levelpos+1) - idx += 1 + # find the starting positions for values and levels + ccrange = cursor.ccrange + defn_levels = cursor.defn_levels + repn_levels = cursor.repn_levels + levelpos = valpos = Int64(0) + + # compute the level and value pos for row + if isempty(repn_levels) + # no repetitions, so each entry corresponds to one full row + levelpos = row - first(ccrange) + 1 + levelrange = levelpos:levelpos + else + # multiple entries may constitute one row + idx = first(ccrange) + levelpos = findfirst(repn_levels, 0) # NOTE: can start from cursor.levelpos to optimize, but that will prevent using setrow to go backwards + while idx < row + levelpos = findnext(repn_levels, 0, levelpos+1) + idx += 1 + end + levelend = max(findnext(repn_levels, 0, levelpos+1)-1, length(repn_levels)) + levelrange = levelpos:levelend end - levelend = max(findnext(repn_levels, 0, levelpos+1)-1, length(repn_levels)) - levelrange = levelpos:levelend - end - # compute the val pos for row - if isempty(defn_levels) - # all entries are required, so there must be a corresponding value - valpos = levelpos - #valrange = levelrange - else - maxdefn = cursor.maxdefn - if ccincr - valpos = cursor.valpos + # compute the val pos for row + if isempty(defn_levels) + # all entries are required, so there must be a corresponding value + valpos = levelpos + #valrange = levelrange else - valpos = sum(view(defn_levels, 
1:(levelpos-1)) .== maxdefn) + 1 + maxdefn = cursor.maxdefn + if ccincr + valpos = cursor.valpos + else + valpos = sum(view(defn_levels, 1:(levelpos-1)) .== maxdefn) + 1 + end + #nvals = sum(sub(defn_levels, levelrange) .== maxdefn) + #valrange = valpos:(valpos+nvals-1) end - #nvals = sum(sub(defn_levels, levelrange) .== maxdefn) - #valrange = valpos:(valpos+nvals-1) - end - cursor.levelpos = levelpos - cursor.levelrange = levelrange - cursor.valpos = valpos - #cursor.valrange = valrange + cursor.levelpos = levelpos + cursor.levelrange = levelrange + cursor.valpos = valpos + #cursor.valrange = valrange + end nothing end @@ -209,7 +215,7 @@ function _start(cursor::ColCursor) end function _done(cursor::ColCursor, rowandlevel::Tuple{Int64,Int64}) row, levelpos = rowandlevel - (levelpos > last(cursor.levelrange)) && _done(cursor.row, row) + (levelpos > last(cursor.levelrange)) || _done(cursor.row, row) end function _next(cursor::ColCursor{T}, rowandlevel::Tuple{Int64,Int64}) where {T} # find values for current row and level in row @@ -253,28 +259,27 @@ mutable struct RecordCursor{T} colnames::Vector{Vector{String}} colcursors::Vector{ColCursor} colstates::Vector{Tuple{Int64,Int64}} + rows::UnitRange{Int64} # rows to scan over + row::Int64 # current row end function RecordCursor(par::ParFile; rows::UnitRange=1:nrows(par), colnames::Vector{Vector{String}}=colnames(par), row::Signed=first(rows)) colcursors = [ColCursor(par, UnitRange{Int64}(rows), colname, Int64(row)) for colname in colnames] sch = schema(par) rectype = ntelemtype(sch, sch.schema[1]) - RecordCursor{rectype}(par, colnames, colcursors, Array{Tuple{Int64,Int64}}(undef, length(colcursors))) + RecordCursor{rectype}(par, colnames, colcursors, Array{Tuple{Int64,Int64}}(undef, length(colcursors)), rows, row) end eltype(cursor::RecordCursor{T}) where {T} = T -length(cursor::RecordCursor) = length(first(cursor.colcursors).row.rows) +length(cursor::RecordCursor) = length(cursor.rows) -function state(cursor::RecordCursor) - col1_row, _col1_level = first(cursor.colstates) - col1_row # return row as state, picked up from the state of first column -end +state(cursor::RecordCursor) = cursor.row function _start(cursor::RecordCursor) cursor.colstates = [_start(colcursor) for colcursor in cursor.colcursors] state(cursor) end -_done(cursor::RecordCursor, row::Int64) = _done(cursor.colcursors[1].row, row) +_done(cursor::RecordCursor, row::Int64) = (row > last(cursor.rows)) function _next(cursor::RecordCursor{T}, _row::Int64) where {T} states = cursor.colstates @@ -284,11 +289,14 @@ function _next(cursor::RecordCursor{T}, _row::Int64) where {T} col_repeat_state = Dict{AbstractString,Int}() for colid in 1:length(states) # for each column colcursor = cursors[colid] - colval, colstate = _next(colcursor, states[colid]) # for each value, defn level, repn level in column - val, def, rep = colval - update_record(cursor.par, row, colcursor.colname, val, def, rep, col_repeat_state) # update record - states[colid] = colstate # set last state to states + if !_done(colcursor, states[colid]) + colval, colstate = _next(colcursor, states[colid]) # for each value, defn level, repn level in column + val, def, rep = colval + update_record(cursor.par, row, colcursor.colname, val, def, rep, col_repeat_state) # update record + states[colid] = colstate # set last state to states + end end + cursor.row += 1 _nt(row, T), state(cursor) end diff --git a/src/reader.jl b/src/reader.jl index 6cd5f74..7e27869 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -130,13 +130,13 @@ 
function rowgroups(par::ParFile, cnames::Vector{Vector{String}}, rowrange::UnitR cnamesrg = colnames(rowgrp) found = length(intersect(cnames, cnamesrg)) endrow = beginrow + rowgrp.num_rows - 1 - (found == L) && (length(intersect(beginrow:endrow)) > 0) && push!(R, rowgrp) + (found == L) && (length(beginrow:endrow) > 0) && push!(R, rowgrp) beginrow = endrow + 1 end R end -columns(par::ParFile, rowgroupidx::Integer) = columns(par, rowgroups(par)[rowgroupidx]) +columns(par::ParFile, rowgroupidx) = columns(par, rowgroups(par)[rowgroupidx]) columns(par::ParFile, rowgroup::RowGroup) = rowgroup.columns columns(par::ParFile, rowgroup::RowGroup, colname::Vector{String}) = columns(par, rowgroup, [colname]) function columns(par::ParFile, rowgroup::RowGroup, cnames::Vector{Vector{String}}) @@ -166,8 +166,8 @@ function _pagevec(par::ParFile, col::ColumnChunk) end pagevec end -pages(par::ParFile, rowgroupidx::Integer, colidx::Integer) = pages(par, columns(par, rowgroupidx), colidx) -pages(par::ParFile, cols::Vector{ColumnChunk}, colidx::Integer) = pages(par, cols[colidx]) +pages(par::ParFile, rowgroupidx, colidx) = pages(par, columns(par, rowgroupidx), colidx) +pages(par::ParFile, cols::Vector{ColumnChunk}, colidx) = pages(par, cols[colidx]) pages(par::ParFile, col::ColumnChunk) = cacheget(par.page_cache, col, col->_pagevec(par,col)) function bytes(page::Page, uncompressed::Bool=true) @@ -195,8 +195,8 @@ end map_dict_vals(valdict::Vector{T1}, vals::Vector{T2}) where {T1, T2} = isempty(valdict) ? vals : [valdict[v+1] for v in vals] -values(par::ParFile, rowgroupidx::Integer, colidx::Integer) = values(par, columns(par, rowgroupidx), colidx) -values(par::ParFile, cols::Vector{ColumnChunk}, colidx::Integer) = values(par, cols[colidx]) +values(par::ParFile, rowgroupidx, colidx) = values(par, columns(par, rowgroupidx), colidx) +values(par::ParFile, cols::Vector{ColumnChunk}, colidx) = values(par, cols[colidx]) function values(par::ParFile, col::ColumnChunk) ctype = coltype(col) pgs = pages(par, col) @@ -232,8 +232,8 @@ function values(par::ParFile, col::ColumnChunk) vals, defn_levels, repn_levels end -function read_levels(io::IO, max_val::Integer, enc::Int32, num_values::Integer) - bw = bitwidth(max_val) +function read_levels(io::IO, max_val::Int, enc::Int32, num_values::Int32) + bw = UInt8(bitwidth(max_val)) (bw == 0) && (return Int[]) @debug("reading levels. enc:$enc ($(Thrift.enumstr(Encoding,enc))), max_val:$max_val, num_values:$num_values") @@ -249,7 +249,7 @@ function read_levels(io::IO, max_val::Integer, enc::Int32, num_values::Integer) end end -function read_values(io::IO, enc::Int32, typ::Int32, num_values::Integer) +function read_values(io::IO, enc::Int32, typ::Int32, num_values::Int32) @debug("reading values. enc:$enc ($(Thrift.enumstr(Encoding,enc))), num_values:$num_values") if enc == Encoding.PLAIN @@ -280,7 +280,7 @@ function values(par::ParFile, page::Page) end end -function read_levels_and_values(io::IO, encs::Tuple, ctype::Int32, num_values::Integer, par::ParFile, page::Page) +function read_levels_and_values(io::IO, encs::Tuple, ctype::Int32, num_values::Int32, par::ParFile, page::Page) cname = colname(page.colchunk) enc, defn_enc, rep_enc = encs @@ -298,7 +298,7 @@ function read_levels_and_values(io::IO, encs::Tuple, ctype::Int32, num_values::I # where defn_levels's elements == 1 are present and only # sum(defn_levels) values can be read. 
# because defn_levels == 0 are where the missing vlaues are - nmissing = sum(==(0), defn_levels) + nmissing = Int32(sum(==(0), defn_levels)) vals = read_values(io, enc, ctype, num_values - nmissing) vals, defn_levels, repn_levels diff --git a/test/test_codec.jl b/test/test_codec.jl index 329d01a..cadc3fc 100644 --- a/test/test_codec.jl +++ b/test/test_codec.jl @@ -5,7 +5,7 @@ function test_codec() println("testing reading bitpacked run (old scheme)...") let data = UInt8[0x05, 0x39, 0x77] io = PipeBuffer(data) - decoded = Parquet.read_bitpacked_run_old(io, 8, 3) + decoded = Parquet.read_bitpacked_run_old(io, 8, UInt8(3)) @test decoded == Int32[0:7;] end println("passed.") @@ -13,7 +13,11 @@ function test_codec() println("testing reading bitpacked run...") let data = UInt8[0x88, 0xc6, 0xfa] io = PipeBuffer(data) - decoded = Parquet.read_bitpacked_run(io, 1, 3) + bits = UInt8(3) + byt = Parquet.bit2bytewidth(bits) + itype = Parquet.byt2itype(byt) + arr = Array{itype}(undef, 8) + decoded = Parquet.read_bitpacked_run(io, 1, bits, byt, itype, arr) @test decoded == Int32[0:7;] end println("passed.") From 0a822aeb5582618bec4d9795537115b68ec34404 Mon Sep 17 00:00:00 2001 From: tan Date: Tue, 19 May 2020 12:46:17 +0530 Subject: [PATCH 30/52] more performance fixes --- src/cursor.jl | 48 ++++++++++++++++++++++++++++-------------------- src/reader.jl | 8 ++++++++ src/schema.jl | 22 +++++++++++----------- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/src/cursor.jl b/src/cursor.jl index 1c7bf72..71e46c2 100644 --- a/src/cursor.jl +++ b/src/cursor.jl @@ -227,10 +227,10 @@ function _next(cursor::ColCursor{T}, rowandlevel::Tuple{Int64,Int64}) where {T} repn_level = isempty(cursor.repn_levels) ? 0 : cursor.repn_levels[levelpos] cursor.levelpos += 1 if defn_level == maxdefn - val = (cursor.vals[cursor.valpos])::Union{Nothing,T} + val = (cursor.vals[cursor.valpos])::T cursor.valpos += 1 else - val = (nothing)::Union{Nothing,T} + val = nothing end # advance row @@ -239,10 +239,10 @@ function _next(cursor::ColCursor{T}, rowandlevel::Tuple{Int64,Int64}) where {T} setrow(cursor, row) end - (val, defn_level, repn_level), (row, cursor.levelpos) + NamedTuple{(:value, :defn_level, :repn_level),Tuple{Union{Nothing,T},Int64,Int64}}((val, defn_level, repn_level)), (row, cursor.levelpos) end -function Base.iterate(cursor::ColCursor, state) +function Base.iterate(cursor::ColCursor{T}, state) where {T} _done(cursor, state) && return nothing return _next(cursor, state) end @@ -286,15 +286,11 @@ function _next(cursor::RecordCursor{T}, _row::Int64) where {T} cursors = cursor.colcursors row = Dict{Symbol,Any}() - col_repeat_state = Dict{AbstractString,Int}() + col_repeat_state = Dict{Tuple{Int,Int},Int}() for colid in 1:length(states) # for each column colcursor = cursors[colid] - if !_done(colcursor, states[colid]) - colval, colstate = _next(colcursor, states[colid]) # for each value, defn level, repn level in column - val, def, rep = colval - update_record(cursor.par, row, colcursor.colname, val, def, rep, col_repeat_state) # update record - states[colid] = colstate # set last state to states - end + colstate = states[colid] + states[colid] = update_record(cursor.par, row, colid, colcursor, colstate, col_repeat_state) end cursor.row += 1 _nt(row, T), state(cursor) @@ -310,20 +306,32 @@ function Base.iterate(cursor::RecordCursor{T}) where {T} return r end -function _nt(dict::Dict{Symbol,Any}, ::Type{T}) where {T} - _val_or_missing = (idx,k) -> begin - v = get(dict, k, missing) - isa(v, Dict{Symbol,Any}) ? 
_nt(v, T.types[idx]) : v +function _val_or_missing(dict::Dict{Symbol,Any}, k::Symbol, ::Type{T}) where {T} + v = get(dict, k, missing) + (isa(v, Dict{Symbol,Any}) ? _nt(v, T) : v)::T +end + +@generated function _nt(dict::Dict{Symbol,Any}, ::Type{T}) where {T} + names = fieldnames(T) + strnames = ["$n" for n in names] + quote + return T(($([:(_val_or_missing(dict,Symbol($(strnames[i])),$(fieldtype(T,i)))) for i in 1:length(names)]...),)) end - values = [_val_or_missing(idx,k) for (idx,k) in enumerate(T.names)] - T((values...,)) end default_init(::Type{Vector{T}}) where {T} = Vector{T}() default_init(::Type{Dict{Symbol,Any}}) = Dict{Symbol,Any}() default_init(::Type{T}) where {T} = ccall(:jl_new_struct_uninit, Any, (Any,), T)::T -function update_record(par::ParFile, row::Dict{Symbol,Any}, nameparts::Vector{String}, val, defn_level::Signed, repn_level::Signed, col_repeat_state::Dict{AbstractString,Int}) +function update_record(par::ParFile, row::Dict{Symbol,Any}, colid::Int, colcursor::ColCursor{T}, colcursor_state::Tuple{Int64,Int64}, col_repeat_state::Dict{Tuple{Int,Int},Int}) where {T} + if !_done(colcursor, colcursor_state) + colval, colcursor_state = _next(colcursor, colcursor_state) # for each value, defn level, repn level in column + update_record(par, row, colid, colcursor.colname, colval.value, colval.defn_level, colval.repn_level, col_repeat_state) # update record + end + colcursor_state # return new colcursor state +end + +function update_record(par::ParFile, row::Dict{Symbol,Any}, colid::Int, nameparts::Vector{String}, val, defn_level::Int64, repn_level::Int64, col_repeat_state::Dict{Tuple{Int,Int},Int}) lparts = length(nameparts) sch = par.schema F = row # the current field corresponding to the level in nameparts @@ -332,7 +340,7 @@ function update_record(par::ParFile, row::Dict{Symbol,Any}, nameparts::Vector{St # for each name part of colname (a field) for idx in 1:lparts - colname = nameparts[1:idx] + colname = view(nameparts, 1:idx) #@debug("updating part $colname of $nameparts isnull:$(val === nothing), def:$(defn_level), rep:$(repn_level)") leaf = nameparts[idx] symleaf = Symbol(leaf) @@ -345,7 +353,7 @@ function update_record(par::ParFile, row::Dict{Symbol,Any}, nameparts::Vector{St defined = ((val === nothing) || (idx < lparts)) ? haskey(F, symleaf) : false mustdefine = defn_level >= Fdefn mustrepeat = repeated && (repn_level == Frepn) - repkey = join(nameparts, '.') * ":" * join(colname, '.') + repkey = (colid, idx) #join(nameparts, '.') * ":" * string(idx) #join(colname, '.') repidx = get(col_repeat_state, repkey, 0) if mustrepeat repidx += 1 diff --git a/src/reader.jl b/src/reader.jl index 7e27869..46449b0 100644 --- a/src/reader.jl +++ b/src/reader.jl @@ -218,6 +218,14 @@ function values(par::ParFile, col::ColumnChunk) enc, defn_enc, rep_enc = page_encodings(pg) if enc == Encoding.PLAIN_DICTIONARY || enc == Encoding.RLE_DICTIONARY append!(vals, map_dict_vals(valdict, _vals)) + #= + if isempty(valdict) + append!(vals, _vals) + else + mapped_vals = [valdict[v+1] for v in _vals] + append!(vals, mapped_vals) + end + =# else append!(vals, _vals) end diff --git a/src/schema.jl b/src/schema.jl index fe360c8..7776a92 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -41,23 +41,23 @@ mutable struct Schema end end -leafname(schname::Vector{String}) = [schname[end]] +leafname(schname::T) where {T <: AbstractVector{String}} = [schname[end]] -parentname(schname::Vector{String}) = istoplevel(schname) ? 
schname : schname[1:(end-1)] +parentname(schname::T) where {T <: AbstractVector{String}} = istoplevel(schname) ? schname : schname[1:(end-1)] istoplevel(schname::Vector) = !(length(schname) > 1) -elem(sch::Schema, schname::Vector{String}) = sch.name_lookup[schname] +elem(sch::Schema, schname::T) where {T <: AbstractVector{String}} = sch.name_lookup[schname] isrepetitiontype(schelem::SchemaElement, repetition_type) = Thrift.isfilled(schelem, :repetition_type) && (schelem.repetition_type == repetition_type) -isrequired(sch::Schema, schname::Vector{String}) = isrequired(elem(sch, schname)) +isrequired(sch::Schema, schname::T) where {T <: AbstractVector{String}} = isrequired(elem(sch, schname)) isrequired(schelem::SchemaElement) = isrepetitiontype(schelem, FieldRepetitionType.REQUIRED) -isoptional(sch::Schema, schname::Vector{String}) = isoptional(elem(sch, schname)) +isoptional(sch::Schema, schname::T) where {T <: AbstractVector{String}} = isoptional(elem(sch, schname)) isoptional(schelem::SchemaElement) = isrepetitiontype(schelem, FieldRepetitionType.OPTIONAL) -isrepeated(sch::Schema, schname::Vector{String}) = isrepeated(elem(sch, schname)) +isrepeated(sch::Schema, schname::T) where {T <: AbstractVector{String}} = isrepeated(elem(sch, schname)) isrepeated(schelem::SchemaElement) = isrepetitiontype(schelem, FieldRepetitionType.REPEATED) function path_in_schema(sch::Schema, schelem::SchemaElement) @@ -67,7 +67,7 @@ function path_in_schema(sch::Schema, schelem::SchemaElement) error("schema element not found in schema") end -function logical_convert(sch::Schema, schname::Vector{String}, val) +function logical_convert(sch::Schema, schname::T, val) where {T <: AbstractVector{String}} elem = sch.name_lookup[schname] if schname in keys(sch.map_logical_types) @@ -81,7 +81,7 @@ function logical_convert(sch::Schema, schname::Vector{String}, val) end end -elemtype(sch::Schema, schname::Vector{String}) = get!(sch.type_lookup, schname) do +elemtype(sch::Schema, schname::T) where {T <: AbstractVector{String}} = get!(sch.type_lookup, schname) do elem = sch.name_lookup[schname] if schname in keys(sch.map_logical_types) @@ -111,7 +111,7 @@ function elemtype(schelem::SchemaElement) jtype end -ntelemtype(sch::Schema, schname::Vector{String}) = get!(sch.nttype_lookup, schname) do +ntelemtype(sch::Schema, schname::T) where {T <: AbstractVector{String}} = get!(sch.nttype_lookup, schname) do ntelemtype(sch, sch.name_lookup[schname]) end function ntelemtype(sch::Schema, schelem::SchemaElement) @@ -130,12 +130,12 @@ bit_or_byte_length(schelem::SchemaElement) = Thrift.isfilled(schelem, :type_leng num_children(schelem::SchemaElement) = Thrift.isfilled(schelem, :num_children) ? schelem.num_children : 0 -function max_repetition_level(sch::Schema, schname::Vector{String}) +function max_repetition_level(sch::Schema, schname::T) where {T <: AbstractVector{String}} lev = isrepeated(sch, schname) ? 1 : 0 istoplevel(schname) ? lev : (lev + max_repetition_level(sch, parentname(schname))) end -function max_definition_level(sch::Schema, schname::Vector{String}) +function max_definition_level(sch::Schema, schname::T) where {T <: AbstractVector{String}} lev = isrequired(sch, schname) ? 0 : 1 istoplevel(schname) ? 
lev : (lev + max_definition_level(sch, parentname(schname))) end From d4f8a94cc1d5ced264074c74f21e071b0282cae1 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Tue, 19 May 2020 21:18:37 +1000 Subject: [PATCH 31/52] minor --- .gitignore | 6 ++++ Project.toml | 2 ++ src/Parquet.jl | 5 +++ src/column_reader.jl | 79 ++++++++++++++++++++++++++++++++++++++++++++ src/encoding.jl | 14 ++++++++ src/metadata.jl | 13 ++++++++ src/read_parquet.jl | 56 +++++++++++++++++++++++++++++++ 7 files changed, 175 insertions(+) create mode 100644 src/column_reader.jl create mode 100644 src/encoding.jl create mode 100644 src/metadata.jl create mode 100644 src/read_parquet.jl diff --git a/.gitignore b/.gitignore index c4f35ef..0d9aedb 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,9 @@ parquet-compatibility/ julia-parquet-compatibility/ .vscode/settings.json +Manifest.toml +parquet.code-workspace +src/column_reader_rewrite.jl +src/column_reader_to_vals.jl +src/column_reader-dev.jl +src/read_parquet-test.jl diff --git a/Project.toml b/Project.toml index 2c22e9d..070aa30 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,8 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +NamedTupleTools = "d9ec5142-1e00-5aa0-9d6a-321866360f50" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22" diff --git a/src/Parquet.jl b/src/Parquet.jl index 313f8dd..1e84987 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -22,6 +22,7 @@ export logical_timestamp, logical_string export RecordCursor export write_parquet +export read_parquet # package code goes here include("PAR2/PAR2.jl") @@ -32,5 +33,9 @@ include("reader.jl") include("cursor.jl") include("show.jl") include("writer.jl") +include("encoding.jl") +include("metadata.jl") +include("column_reader.jl") +include("read_parquet.jl") end # module diff --git a/src/column_reader.jl b/src/column_reader.jl new file mode 100644 index 0000000..d91d20a --- /dev/null +++ b/src/column_reader.jl @@ -0,0 +1,79 @@ + +const TYPES = (Bool, Int32, Int64, Int128, Float32, Float64, String, UInt8) + +read_column(path, col_num) = read_column(path, metadata(path), col_num) + +function read_column(path, filemetadata, col_num) + par = ParFile(path) + + T = TYPES[filemetadata.schema[col_num+1]._type+1] + # TODO detect if missing is necessary + res = Vector{Union{Missing, T}}(missing, nrows(par)) + write_cursor = 1 + for row_group in filemetadata.row_groups + pgs = pages(par, row_group.columns[col_num]) + + drop_page_count = 0 + # is the first page a dictionary page + # this is not the case for boolean values for example + if isfilled(pgs[1].hdr, :dictionary_page_header) + # the first page is almost always the dictionary page + dictionary_page = pgs[1] + drop_page_count = 1 + dictionary_of_values = T.(values(par, dictionary_page)[1]) + end + + # TODO deal with other types of pages e.g. 
dataheaderv2 + + # everything after the first data datapages + for data_page in Base.Iterators.drop(pgs, drop_page_count) + vals, definitions, decode = values(par, data_page) + + @assert all(in((0, 1)), definitions) + + l = sum(==(1), definitions) + # if all definitions values are 1 then it's not used + definitions_not_used = all(==(1), definitions) + + # data_page can be either + # * dictionary-encoded in which case we should look into the dictionary + # * plained-encoded in which case just return the values + page_encoding = Parquet.page_encoding(data_page) + + if page_encoding == Encoding.PLAIN_DICTIONARY + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = dictionary_of_values[value + 1] + val_index += 1 + end + end + end + elseif page_encoding == Encoding.PLAIN + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= T.(vals) + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = T(value) + val_index += 1 + end + end + end + else + error("page encoding not supported yet") + end + + write_cursor += length(definitions) + end + end + return res +end + +() diff --git a/src/encoding.jl b/src/encoding.jl new file mode 100644 index 0000000..8efd5b6 --- /dev/null +++ b/src/encoding.jl @@ -0,0 +1,14 @@ +# obtain the encoding of the page +using Thrift: isfilled + +function page_encoding(page::Page) + if isfilled(page.hdr, :data_page_header) + return page.hdr.data_page_header.encoding + elseif isfilled(page.hdr, :data_page_header_v2) + return page.hdr.data_page_header_v2.encoding + elseif isfilled(page.hdr, :dictionary_page_header) + return page.hdr.dictionary_page_header.encoding + else + error("not supported page") + end +end diff --git a/src/metadata.jl b/src/metadata.jl new file mode 100644 index 0000000..1c8c5af --- /dev/null +++ b/src/metadata.jl @@ -0,0 +1,13 @@ +using Thrift + +function metadata(path) + io = open(path) + sz = filesize(io) + seek(io, sz - SZ_PAR_MAGIC - SZ_FOOTER) + + # read footer size as little endian signed Int32 + meta_len = read(io, Int32) + datasize = sz - meta_len - 2SZ_PAR_MAGIC - SZ_FOOTER + seek(io, SZ_PAR_MAGIC + datasize) + filemetadata = read_thrift(io, PAR2.FileMetaData) +end diff --git a/src/read_parquet.jl b/src/read_parquet.jl new file mode 100644 index 0000000..4916a6c --- /dev/null +++ b/src/read_parquet.jl @@ -0,0 +1,56 @@ +using Base.Threads: @spawn +using Base.Iterators: drop +using ProgressMeter: @showprogress +using NamedTupleTools: namedtuple + +read_parquet(path, cols::Vector{Symbol}; kwargs...) = read_parquet(path, String.(cols); kwargs...) + +read_parquet(path; kwargs...) = read_parquet(path, String[]; kwargs...) 
+ +function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = false) + """function for reading parquet""" + + if multithreaded + # use a bounded channel to limit + c1 = Channel{Bool}(Threads.nthreads()) + atexit(()->close(c1)) + end + + nc = ncols(ParFile(path)) + + colnames = [sch.name for sch in drop(ParFile(path).schema.schema, 1)] + + if length(cols) == 0 + colnums = collect(1:nc) + else + colnums = [findfirst(==(c), colnames) for c in cols] + end + + results = Vector{Any}(undef, length(colnums)) + + filemetadata = metadata(path) + + if multithreaded + @showprogress for (i, j) in enumerate(colnums) + put!(c1, true) + results[i] = @spawn begin + res = read_column(path, filemetadata, j) + take!(c1) + res + end + end + else + @showprogress for (i, j) in enumerate(colnums) + results[i] = read_column(path, filemetadata, j) + end + end + + symbol_col_names = collect(Symbol(col) for col in colnames[colnums]) + + if multithreaded + fnl_results = collect(fetch(result) for result in results) + return namedtuple(symbol_col_names, fnl_results) + else + return namedtuple(symbol_col_names, results) + end +end From 8432d5c6df693404f9ee6e605b7c992f4bc017c6 Mon Sep 17 00:00:00 2001 From: evalparse Date: Tue, 19 May 2020 21:21:59 +1000 Subject: [PATCH 32/52] Update README.md --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 9a42604..8012ec4 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,16 @@ ## Reader +### High level reader + +You can read a parquet file using `read_parquet` for example + +``` +df = read_parquet(parquet_file_path); +``` + +### Lower level reader + Load a [parquet file](https://en.wikipedia.org/wiki/Apache_Parquet). Only metadata is read initially, data is loaded in chunks on demand. (Note: [ParquetFiles.jl](https://github.com/queryverse/ParquetFiles.jl) also provides load support for Parquet files under the FileIO.jl package.) `ParFile` represents a Parquet file at `path` open for reading. Options to map logical types can be provided via `map_logical_types`. 
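For example, a minimal sketch of the lower-level reader described above (the file name `data.parquet` and the column name `name` are placeholders, and the `(String, Parquet.logical_string)` mapping value is an assumed convention for `map_logical_types`, not something shown in this patch):

```julia
using Parquet

# Open the file; only the footer metadata is read at this point.
# The map_logical_types value format below is an assumption for illustration.
par = ParFile("data.parquet";
              map_logical_types = Dict(["name"] => (String, Parquet.logical_string)))

nrows(par), ncols(par)   # row and column counts taken from the file metadata

# Iterate whole records; each `rec` is a NamedTuple keyed by column name.
for rec in RecordCursor(par)
    # use rec.name, etc.
end

close(par)               # releases the underlying file handle
```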
From fb2b3c2f57ffc2a40ba4efda0f7704fcd35c92a8 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Thu, 21 May 2020 22:52:28 +1000 Subject: [PATCH 33/52] tries to accomodate master --- test/test_writer.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index bfb93b7..2b5ae7e 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -33,10 +33,12 @@ function test_write() # the file is very small so only one rowgroup col_chunks = columns(pf, 1) - for colnum in 1:length(col_chunks) + + for (colnum, col_chunk) in enumerate(col_chunks) + println(colnum) correct_vals = tbl[colnum] coltype = eltype(correct_vals) - vals_from_file = values(pf, col_chunks, colnum) + vals_from_file = values(pf, col_chunk) if Missing <: coltype @test ismissing.(correct_vals) == (vals_from_file[2] .== 0) end @@ -44,7 +46,9 @@ function test_write() if nonmissingtype(coltype) == String @test all(skipmissing(correct_vals) .== String.(vals_from_file[1])) else - @test all(skipmissing(correct_vals) .== vals_from_file[1]) + non_missing_vals = collect(skipmissing(correct_vals)) + non_missing_vals_read = vals_from_file[1][1:sum(vals_from_file[2])] + @test all(non_missing_vals .== non_missing_vals_read) end end @@ -52,4 +56,4 @@ function test_write() close(pf) end -test_write() +# test_write() From 6b7bd64bb001498f477eb890ca2bd2319e9476b1 Mon Sep 17 00:00:00 2001 From: evalparse Date: Fri, 22 May 2020 09:49:17 +1000 Subject: [PATCH 34/52] Update test/test_writer.jl --- test/test_writer.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 2b5ae7e..dddafc1 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -35,7 +35,6 @@ function test_write() for (colnum, col_chunk) in enumerate(col_chunks) - println(colnum) correct_vals = tbl[colnum] coltype = eltype(correct_vals) vals_from_file = values(pf, col_chunk) From 275e7f20d41f4d1fc17bb87881b1ad24e2d80194 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Fri, 22 May 2020 13:29:30 +1000 Subject: [PATCH 35/52] added little endian writes --- src/writer.jl | 20 ++++++++++---------- test/test_writer.jl | 3 ++- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/writer.jl b/src/writer.jl index afaadac..260c976 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -70,7 +70,7 @@ function compress_using_codec(colvals::AbstractVector{String}, codec::Int)::Vect for val in colvals # for string it needs to be stored as BYTE_ARRAY which needs the length # to be the first 4 bytes UInt32 - write(io, val |> sizeof |> UInt32) + write(io, val |> sizeof |> UInt32 |> htol) # write each of the strings one after another write(io, val) end @@ -97,7 +97,7 @@ function write_defn_levels(data_to_compress_io, colvals::AbstractVector{Union{Mi encoded_defn_data_length = length(bitpacking_header) + bytes_needed # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, UInt32(encoded_defn_data_length) |> htol) write(data_to_compress_io, bitpacking_header) write(data_to_compress_io, encoded_defn_data) end @@ -112,7 +112,7 @@ function write_defn_levels(data_to_compress_io, colvals::AbstractVector) encoded_defn_data_length = sizeof(rle_header) + sizeof(repeated_value) # write the definition data - write(data_to_compress_io, UInt32(encoded_defn_data_length)) + write(data_to_compress_io, UInt32(encoded_defn_data_length) |> htol) write(data_to_compress_io, rle_header) write(data_to_compress_io, repeated_value) end @@ -178,7 
+178,7 @@ function write_encoded_data(data_to_compress_io, colvals::Union{AbstractVector{S for val in colvals # for string it needs to be stored as BYTE_ARRAY which needs the length # to be the first 4 bytes UInt32 - write(data_to_compress_io, val |> sizeof |> UInt32) + write(data_to_compress_io, val |> sizeof |> UInt32 |> htol) # write each of the strings one after another write(data_to_compress_io, val) end @@ -201,13 +201,13 @@ end function write_encoded_data(data_to_compress_io, colvals::AbstractArray) """ Efficient write of encoded data for `isbits` types""" @assert isbitstype(eltype(colvals)) - write(data_to_compress_io, colvals) + write(data_to_compress_io, colvals |> htol) end function write_encoded_data(data_to_compress_io, colvals::SkipMissing) """ Write of encoded data for skipped missing types""" for val in colvals - write(data_to_compress_io, val) + write(data_to_compress_io, val |> htol) end end @@ -216,7 +216,7 @@ function write_encoded_data(data_to_compress_io, colvals) The only requirement is that colvals has to be iterable """ for val in skipmissing(colvals) - write(data_to_compress_io, val) + write(data_to_compress_io, val |> htol) end end @@ -288,10 +288,10 @@ function write_col_page(fileio, colvals::AbstractArray, codec, ::Val{PAR2.Encodi rle_header = LittleEndianBase128.encode(UInt32(length(colvals)) << 1) repeated_value = UInt8(1) - encoded_defn_data_length = UInt32(sizeof(rle_header) + sizeof(repeated_value)) + encoded_defn_data_length = sizeof(rle_header) + sizeof(repeated_value) ## write the encoded data length - write(fileio, encoded_defn_data_length) + write(fileio, encoded_defn_data_length |> UInt32 |> htol) write(fileio, rle_header) write(fileio, repeated_value) @@ -593,7 +593,7 @@ function _write_parquet(itr_vectors, colnames, path, nchunks; ncols = length(itr filemetadata_size = write_thrift(fileio, filemetadata) - write(fileio, UInt32(filemetadata_size)) + write(fileio, UInt32(filemetadata_size) |> htol) write(fileio, "PAR1") close(fileio) end diff --git a/test/test_writer.jl b/test/test_writer.jl index dddafc1..5bb9ca8 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -35,6 +35,7 @@ function test_write() for (colnum, col_chunk) in enumerate(col_chunks) + println(colnum) correct_vals = tbl[colnum] coltype = eltype(correct_vals) vals_from_file = values(pf, col_chunk) @@ -55,4 +56,4 @@ function test_write() close(pf) end -# test_write() +test_write() From dda544cfeb9bff1347402adbfd6ea73fa70f6db4 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 01:33:18 +1000 Subject: [PATCH 36/52] minor --- src/column_reader.jl | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index d91d20a..7a07a08 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -75,5 +75,3 @@ function read_column(path, filemetadata, col_num) end return res end - -() From 1930cc776a29d718237062c6902dd0f8a15fc8f5 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 01:52:16 +1000 Subject: [PATCH 37/52] fixed test --- test/test_writer.jl | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 5bb9ca8..795252d 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -33,9 +33,18 @@ function test_write() # the file is very small so only one rowgroup col_chunks = columns(pf, 1) + colnum=12 + col_chunk=col_chunks[colnum] + + correct_vals = tbl[colnum] + coltype = eltype(correct_vals) + vals_from_file = values(pf, col_chunk) + + if 
Missing <: coltype + @test ismissing.(correct_vals) == (vals_from_file[2] .== 0) + end for (colnum, col_chunk) in enumerate(col_chunks) - println(colnum) correct_vals = tbl[colnum] coltype = eltype(correct_vals) vals_from_file = values(pf, col_chunk) @@ -43,10 +52,12 @@ function test_write() @test ismissing.(correct_vals) == (vals_from_file[2] .== 0) end + non_missing_vals = collect(skipmissing(correct_vals)) + if nonmissingtype(coltype) == String - @test all(skipmissing(correct_vals) .== String.(vals_from_file[1])) + non_missing_vals_read = String.(vals_from_file[1][1:sum(vals_from_file[2])]) + @test all(non_missing_vals .== non_missing_vals_read) else - non_missing_vals = collect(skipmissing(correct_vals)) non_missing_vals_read = vals_from_file[1][1:sum(vals_from_file[2])] @test all(non_missing_vals .== non_missing_vals_read) end From 58e79202d0db2d3e0f222ae733549ab1cd84f856 Mon Sep 17 00:00:00 2001 From: evalparse Date: Sat, 23 May 2020 02:00:22 +1000 Subject: [PATCH 38/52] Update src/Parquet.jl --- src/Parquet.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parquet.jl b/src/Parquet.jl index 421a0bb..18f0830 100644 --- a/src/Parquet.jl +++ b/src/Parquet.jl @@ -11,7 +11,7 @@ if VERSION < v"1.3" using Missings: nonmissingtype end -const PARQUET_JL_VERSION = v"0.5.1" +const PARQUET_JL_VERSION = v"0.5.3" import Base: show, open, close, values, eltype, length import Thrift: isfilled From 21d645fe97aaa28248bfd9c002396cc45a14fd77 Mon Sep 17 00:00:00 2001 From: evalparse Date: Sat, 23 May 2020 02:00:56 +1000 Subject: [PATCH 39/52] Update test/test_writer.jl --- test/test_writer.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 795252d..51718b1 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -33,7 +33,6 @@ function test_write() # the file is very small so only one rowgroup col_chunks = columns(pf, 1) - colnum=12 col_chunk=col_chunks[colnum] correct_vals = tbl[colnum] From 54c5f0ca1f3048ccf6f26a57339fe494bc307b61 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 02:04:00 +1000 Subject: [PATCH 40/52] minor fix --- test/test_writer.jl | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/test/test_writer.jl b/test/test_writer.jl index 795252d..3887e80 100644 --- a/test/test_writer.jl +++ b/test/test_writer.jl @@ -33,17 +33,6 @@ function test_write() # the file is very small so only one rowgroup col_chunks = columns(pf, 1) - colnum=12 - col_chunk=col_chunks[colnum] - - correct_vals = tbl[colnum] - coltype = eltype(correct_vals) - vals_from_file = values(pf, col_chunk) - - if Missing <: coltype - @test ismissing.(correct_vals) == (vals_from_file[2] .== 0) - end - for (colnum, col_chunk) in enumerate(col_chunks) correct_vals = tbl[colnum] coltype = eltype(correct_vals) From 6a94305a40de667cb8979c66da2d593fbac1e15e Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 12:17:52 +1000 Subject: [PATCH 41/52] minor: --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitignore b/.gitignore index 0d9aedb..c4f35ef 100644 --- a/.gitignore +++ b/.gitignore @@ -4,9 +4,3 @@ parquet-compatibility/ julia-parquet-compatibility/ .vscode/settings.json -Manifest.toml -parquet.code-workspace -src/column_reader_rewrite.jl -src/column_reader_to_vals.jl -src/column_reader-dev.jl -src/read_parquet-test.jl From 7046f921f7e73b1ce1e566bad8395c5660e81c35 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 23 May 2020 12:26:43 +1000 Subject: [PATCH 42/52] so i dont lose it --- 
src/column_reader_dev.jl | 164 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 src/column_reader_dev.jl diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl new file mode 100644 index 0000000..92e55c5 --- /dev/null +++ b/src/column_reader_dev.jl @@ -0,0 +1,164 @@ + + +using Random: randstring +test_write1() = begin + tbl = ( + int32 = rand(Int32, 1000), + int64 = rand(Int64, 1000), + float32 = rand(Float32, 1000), + float64 = rand(Float64, 1000), + bool = rand(Bool, 1000), + string = [randstring(8) for i in 1:1000], + int32m = rand([missing, rand(Int32, 10)...], 1000), + int64m = rand([missing, rand(Int64, 10)...], 1000), + float32m = rand([missing, rand(Float32, 10)...], 1000), + float64m = rand([missing, rand(Float64, 10)...], 1000), + boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) + ) + + write_parquet("c:/scratch/plsdel.parquet", tbl) +end + +test_write1() + +par = ParFile(path) + +T = TYPES[filemetadata.schema[col_num+1]._type+1] +# TODO detect if missing is necessary +res = Vector{Union{Missing, T}}(missing, nrows(par)) +write_cursor = 1 +for row_group in filemetadata.row_groups + pgs = pages(par, row_group.columns[col_num]) + + drop_page_count = 0 + # is the first page a dictionary page + # this is not the case for boolean values for example + if isfilled(pgs[1].hdr, :dictionary_page_header) + # the first page is almost always the dictionary page + dictionary_page = pgs[1] + drop_page_count = 1 + dictionary_of_values = T.(values(par, dictionary_page)[1]) + end + + # TODO deal with other types of pages e.g. dataheaderv2 + + # everything after the first data datapages + for data_page in Base.Iterators.drop(pgs, drop_page_count) + vals, definitions, decode = values(par, data_page) + + @assert all(in((0, 1)), definitions) + + l = sum(==(1), definitions) + # if all definitions values are 1 then it's not used + definitions_not_used = all(==(1), definitions) + + # data_page can be either + # * dictionary-encoded in which case we should look into the dictionary + # * plained-encoded in which case just return the values + page_encoding = Parquet.page_encoding(data_page) + + if page_encoding == Encoding.PLAIN_DICTIONARY + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = dictionary_of_values[value + 1] + val_index += 1 + end + end + end + elseif page_encoding == Encoding.PLAIN + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= T.(vals) + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = T(value) + val_index += 1 + end + end + end + else + error("page encoding not supported yet") + end + + write_cursor += length(definitions) + end +end +return res +par = ParFile(path) + +T = TYPES[filemetadata.schema[col_num+1]._type+1] +# TODO detect if missing is necessary +res = Vector{Union{Missing, T}}(missing, nrows(par)) +write_cursor = 1 +for row_group in filemetadata.row_groups + pgs = pages(par, row_group.columns[col_num]) + + drop_page_count = 0 + # is the first page a dictionary page + # this is not the case for boolean values for example + if isfilled(pgs[1].hdr, :dictionary_page_header) + # the first page is almost always the dictionary page + dictionary_page = pgs[1] + 
drop_page_count = 1 + dictionary_of_values = T.(values(par, dictionary_page)[1]) + end + + # TODO deal with other types of pages e.g. dataheaderv2 + + # everything after the first data datapages + for data_page in Base.Iterators.drop(pgs, drop_page_count) + vals, definitions, decode = values(par, data_page) + + @assert all(in((0, 1)), definitions) + + l = sum(==(1), definitions) + # if all definitions values are 1 then it's not used + definitions_not_used = all(==(1), definitions) + + # data_page can be either + # * dictionary-encoded in which case we should look into the dictionary + # * plained-encoded in which case just return the values + page_encoding = Parquet.page_encoding(data_page) + + if page_encoding == Encoding.PLAIN_DICTIONARY + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = dictionary_of_values[value + 1] + val_index += 1 + end + end + end + elseif page_encoding == Encoding.PLAIN + if definitions_not_used + res[write_cursor:write_cursor+l-1] .= T.(vals) + else + val_index = 1 + for (offset, definition) in enumerate(definitions) + if definition != 0 + value = vals[val_index] + res[write_cursor+offset-1] = T(value) + val_index += 1 + end + end + end + else + error("page encoding not supported yet") + end + + write_cursor += length(definitions) + end +end +return res From 2331e995c6fe1ae5c63d5f56ed5bff539f459ef8 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 25 May 2020 00:56:52 +1000 Subject: [PATCH 43/52] got a copy based reader working --- src/column_reader.jl | 273 ++++++++++++++++++++++++++++++--------- src/column_reader_dev.jl | 227 ++++++++++++-------------------- 2 files changed, 293 insertions(+), 207 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 7a07a08..1589872 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -1,77 +1,230 @@ +import Base: iterate, length, IteratorSize, IteratorEltype, eltype const TYPES = (Bool, Int32, Int64, Int128, Float32, Float64, String, UInt8) -read_column(path, col_num) = read_column(path, metadata(path), col_num) +struct BitPackedIterator + data::Vector{UInt8} + bitwidth::Int32 +end + + +iterate(bp::BitPackedIterator) = iterate(bp::BitPackedIterator, 1) + +length(bp::BitPackedIterator) = div(8*length(bp.data), bp.bitwidth) + +IteratorSize(::Type{BitPackedIterator}) = Base.HasLength() +IteratorEltype(::Type{BitPackedIterator}) = Base.HasEltype() +eltype(::Type{BitPackedIterator}) = UInt + +function iterate(bp::BitPackedIterator, state) + end_bit = state * bp.bitwidth + end_byte = ceil(Int, end_bit / 8) + + if end_byte > length(bp.data) + return nothing + end -function read_column(path, filemetadata, col_num) + start_bit = (state - 1) * bp.bitwidth + 1 + + start_byte, bits_to_drop = divrem(start_bit-1, 8) + + start_byte += 1 + bits_to_drop = bits_to_drop + + # start bit shift the value + value = UInt(0) + + @inbounds for byte in @view bp.data[end_byte:-1:start_byte] + value = (value << 8) | byte + end + + value >>= bits_to_drop + + (value & UInt(2^bp.bitwidth-1), state + 1) +end + +function decompress_with_codec(compressed_data::Vector{UInt8}, codec)::Vector{UInt8} + if codec == PAR2.CompressionCodec.SNAPPY + uncompressed_data = Snappy.uncompress(compressed_data) + else + error("codedc $codec unsupported atm") + end +end + +zero_or_missing(::Type{String}) = missing +zero_or_missing(::Type{T}) where 
T = zero(T) + +function read_column(path, col_num) + filemetadata = Parquet.metadata(path) par = ParFile(path) + fileio = open(path) T = TYPES[filemetadata.schema[col_num+1]._type+1] + # TODO detect if missing is necessary - res = Vector{Union{Missing, T}}(missing, nrows(par)) - write_cursor = 1 + res = Vector{Union{Missing, T}}(undef, nrows(par)) + res .= zero_or_missing(T) + + length(filemetadata.row_groups) + + from = 1 + last_from = from for row_group in filemetadata.row_groups - pgs = pages(par, row_group.columns[col_num]) - - drop_page_count = 0 - # is the first page a dictionary page - # this is not the case for boolean values for example - if isfilled(pgs[1].hdr, :dictionary_page_header) - # the first page is almost always the dictionary page - dictionary_page = pgs[1] - drop_page_count = 1 - dictionary_of_values = T.(values(par, dictionary_page)[1]) + colchunk_meta = row_group.columns[col_num].meta_data + + if isfilled(colchunk_meta, :dictionary_page_offset) + seek(fileio, colchunk_meta.dictionary_page_offset) + dict_page_header = read_thrift(fileio, PAR2.PageHeader) + compressed_data = read(fileio, dict_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) + @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size + + if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY + # see https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 + # which is in effect the plain encoding see https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 + dict = reinterpret(T, uncompressed_data) + else + error("Only Plain Dictionary encoding is supported") + end + else + dict = nothing end - # TODO deal with other types of pages e.g. 
dataheaderv2 - - # everything after the first data datapages - for data_page in Base.Iterators.drop(pgs, drop_page_count) - vals, definitions, decode = values(par, data_page) - - @assert all(in((0, 1)), definitions) - - l = sum(==(1), definitions) - # if all definitions values are 1 then it's not used - definitions_not_used = all(==(1), definitions) - - # data_page can be either - # * dictionary-encoded in which case we should look into the dictionary - # * plained-encoded in which case just return the values - page_encoding = Parquet.page_encoding(data_page) - - if page_encoding == Encoding.PLAIN_DICTIONARY - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = dictionary_of_values[value + 1] - val_index += 1 - end - end - end - elseif page_encoding == Encoding.PLAIN - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= T.(vals) - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = T(value) - val_index += 1 - end - end + # seek to the first data page + seek(fileio, colchunk_meta.data_page_offset) + + # repeated read data page + while from - last_from < row_group.num_rows + from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 + end + last_from = from + end + + res +end + +function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integer = 1) + """ + This function assumes + """ + + # the result length is used latter on to prevent writing too much data + res_len = length(res) + + to = from # intialise to something + + data_page_header = read_thrift(fileio, PAR2.PageHeader) + compressed_data = read(fileio, data_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, codec) + @assert length(uncompressed_data) == data_page_header.uncompressed_page_size + + # this is made up of these 3 things written back to back + # * repetition levels - can be ignored for unnested data + # * definition levels - + # * values + + # definition levels + # do_read_defn_lvls = isfilled(data_page_header.data_page_header, :statistics) && + # isfilled(data_page_header.data_page_header.statistics, :null_count) && + # data_page_header.data_page_header.statistics.null_count > 0 + uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) + + if data_page_header.data_page_header.definition_level_encoding == PAR2.Encoding.RLE + # for unnested columns the highest possible value for definiton is 1 + # which can represented with just one bit so the bit width is always 1 + bitwidth = 1 + encoded_data_len = read(uncompressed_data_io, UInt32) + pos_before_encoded_data = position(uncompressed_data_io) + encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + + if iseven(encoded_data_header) + # RLE encoded + rle_len = Int(encoded_data_header >> 1) + rle_val = read(uncompressed_data_io, 1) + pos_after_reading_encoded_data = position(uncompressed_data_io) + else + # bitpacked encoded + bit_pack_len = Int(encoded_data_header >> 1) + end + else + error("encoding not supported") + end + + @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len + + # this is how many values should have been read + num_values_check = data_page_header.data_page_header.num_values + + # valuess + 
if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN + # just return the data as is + # TODO would it better if take! is done? + + if T == Bool + # for boolean every bit is a value so the length is 8 times + digits(UInt8, read(uncompressed_data_io), base=2) + len_raw_data = 8length(raw_data) + else + pos_for_pointer = position(uncompressed_data_io) + 1 + src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) + dest_ptr = Ptr{T}(pointer(res, from)) + # copy content over + GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, num_values_check) + to = min(from + num_values_check - 1, res_len) + + + # raw_data = reinterpret(T, read(uncompressed_data_io)) + # len_raw_data = length(raw_data) + # to = min(from + len_raw_data - 1, res_len) + #res[from:to] .= raw_data + end + elseif data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY + # this means the data is encoded in integers format which form the indices to the data + bitwidth = Int(read(uncompressed_data_io, UInt8)) + + # the documented max bitwidth is + @assert bitwidth <= 32 + + while !eof(uncompressed_data_io) + # println(position(uncompressed_data_io)) + encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + + if iseven(encoded_data_header) + # RLE encoded + rle_len = Int(encoded_data_header >> 1) + rle_val_vec::Vector{UInt8} = read(uncompressed_data_io, ceil(Int, bitwidth/8)) + rle_val = UInt(0) + + for tmp in @view rle_val_vec[end:-1:1] + rle_val = rle_val << 8 + rle_val = rle_val | tmp end + + to = min(from + rle_len - 1, res_len) + res[from:to] .= dict[rle_val+1] + + from = from + rle_len else - error("page encoding not supported yet") - end + # bitpacked encoded + bit_pack_len = Int(encoded_data_header >> 1) + @assert (bit_pack_len >= 1) && (bit_pack_len <= 2^31 - 1) + bytes_to_read = bitwidth*bit_pack_len + data = read(uncompressed_data_io, bytes_to_read) + bp = BitPackedIterator(data, bitwidth) + # now need a decoding algorithm to break it up + # reading `bitwidth` bits at a time + l = length(bp) + to = min(from + l - 1, res_len) - write_cursor += length(definitions) + for (v, i) in zip(bp, from:to) + res[i] = dict[v+1] + end + from = from + l + end end + else + erorr("encoding not supported") end - return res + + to end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 92e55c5..84c4ecd 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -1,164 +1,97 @@ +using Parquet +using Parquet:TYPES, read_thrift, PAR2, BitPackedIterator, decompress_with_codec +using Thrift: isfilled +using Snappy, CodecZlib, CodecZstd +path = "c:/git/parquet-data-collection/dsd50p.parquet" +path = "c:/data/Performance_2003Q3.txt.parquet" + +meta = Parquet.metadata(path); +par = ParFile(path); + +nrows(par) + +colnames(par) using Random: randstring -test_write1() = begin - tbl = ( - int32 = rand(Int32, 1000), - int64 = rand(Int64, 1000), - float32 = rand(Float32, 1000), - float64 = rand(Float64, 1000), - bool = rand(Bool, 1000), - string = [randstring(8) for i in 1:1000], - int32m = rand([missing, rand(Int32, 10)...], 1000), - int64m = rand([missing, rand(Int64, 10)...], 1000), - float32m = rand([missing, rand(Float32, 10)...], 1000), - float64m = rand([missing, rand(Float64, 10)...], 1000), - boolm = rand([missing, true, false], 1000), - stringm = rand([missing, "abc", "def", "ghi"], 1000) - ) - - write_parquet("c:/scratch/plsdel.parquet", tbl) +tbl = ( + int32 = rand(Int32, 1000), + int64 = rand(Int64, 1000), + float32 = rand(Float32, 1000), 
+ float64 = rand(Float64, 1000), + bool = rand(Bool, 1000), + string = [randstring(8) for i in 1:1000], + int32m = rand([missing, rand(Int32, 10)...], 1000), + int64m = rand([missing, rand(Int64, 10)...], 1000), + float32m = rand([missing, rand(Float32, 10)...], 1000), + float64m = rand([missing, rand(Float64, 10)...], 1000), + boolm = rand([missing, true, false], 1000), + stringm = rand([missing, "abc", "def", "ghi"], 1000) +) + +tmpfile = tempname()*".parquet" + +write_parquet(tmpfile, tbl) + +path = tmpfile + +for i in 1:12 + @time col1 = Parquet.read_column(path, i); end -test_write1() +@time col1 = Parquet.read_column(path, 1) +col1 == tbl.int32 +col_num = 5 + +filemetadata = Parquet.metadata(path) par = ParFile(path) +fileio = open(path) T = TYPES[filemetadata.schema[col_num+1]._type+1] + # TODO detect if missing is necessary res = Vector{Union{Missing, T}}(missing, nrows(par)) -write_cursor = 1 -for row_group in filemetadata.row_groups - pgs = pages(par, row_group.columns[col_num]) - - drop_page_count = 0 - # is the first page a dictionary page - # this is not the case for boolean values for example - if isfilled(pgs[1].hdr, :dictionary_page_header) - # the first page is almost always the dictionary page - dictionary_page = pgs[1] - drop_page_count = 1 - dictionary_of_values = T.(values(par, dictionary_page)[1]) - end - # TODO deal with other types of pages e.g. dataheaderv2 - - # everything after the first data datapages - for data_page in Base.Iterators.drop(pgs, drop_page_count) - vals, definitions, decode = values(par, data_page) - - @assert all(in((0, 1)), definitions) - - l = sum(==(1), definitions) - # if all definitions values are 1 then it's not used - definitions_not_used = all(==(1), definitions) - - # data_page can be either - # * dictionary-encoded in which case we should look into the dictionary - # * plained-encoded in which case just return the values - page_encoding = Parquet.page_encoding(data_page) - - if page_encoding == Encoding.PLAIN_DICTIONARY - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = dictionary_of_values[value + 1] - val_index += 1 - end - end - end - elseif page_encoding == Encoding.PLAIN - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= T.(vals) - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = T(value) - val_index += 1 - end - end - end - else - error("page encoding not supported yet") - end - - write_cursor += length(definitions) +length(filemetadata.row_groups) + +from = 1 +last_from = from + +row_group = filemetadata.row_groups[1] + +colchunk_meta = row_group.columns[col_num].meta_data + +if isfilled(colchunk_meta, :dictionary_page_offset) + seek(fileio, colchunk_meta.dictionary_page_offset) + dict_page_header = read_thrift(fileio, PAR2.PageHeader) + compressed_data = read(fileio, dict_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) + @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size + + if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY + # see https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 + # which is in effect the plain 
encoding see https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 + dict = reinterpret(T, uncompressed_data) + else + error("Only Plain Dictionary encoding is supported") end +else + dict = nothing end -return res -par = ParFile(path) -T = TYPES[filemetadata.schema[col_num+1]._type+1] -# TODO detect if missing is necessary -res = Vector{Union{Missing, T}}(missing, nrows(par)) -write_cursor = 1 -for row_group in filemetadata.row_groups - pgs = pages(par, row_group.columns[col_num]) - - drop_page_count = 0 - # is the first page a dictionary page - # this is not the case for boolean values for example - if isfilled(pgs[1].hdr, :dictionary_page_header) - # the first page is almost always the dictionary page - dictionary_page = pgs[1] - drop_page_count = 1 - dictionary_of_values = T.(values(par, dictionary_page)[1]) - end +# seek to the first data page +seek(fileio, colchunk_meta.data_page_offset) - # TODO deal with other types of pages e.g. dataheaderv2 - - # everything after the first data datapages - for data_page in Base.Iterators.drop(pgs, drop_page_count) - vals, definitions, decode = values(par, data_page) - - @assert all(in((0, 1)), definitions) - - l = sum(==(1), definitions) - # if all definitions values are 1 then it's not used - definitions_not_used = all(==(1), definitions) - - # data_page can be either - # * dictionary-encoded in which case we should look into the dictionary - # * plained-encoded in which case just return the values - page_encoding = Parquet.page_encoding(data_page) - - if page_encoding == Encoding.PLAIN_DICTIONARY - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= dictionary_of_values[vals.+1] - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = dictionary_of_values[value + 1] - val_index += 1 - end - end - end - elseif page_encoding == Encoding.PLAIN - if definitions_not_used - res[write_cursor:write_cursor+l-1] .= T.(vals) - else - val_index = 1 - for (offset, definition) in enumerate(definitions) - if definition != 0 - value = vals[val_index] - res[write_cursor+offset-1] = T(value) - val_index += 1 - end - end - end - else - error("page encoding not supported yet") - end - - write_cursor += length(definitions) - end +pg = read_thrift(fileio, PAR2.PageHeader) + +Parquet.read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + +# repeated read data page +while from - last_from < row_group.num_rows + from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 end -return res +last_from = from + + +res From 87160411c2702d09d18c86b275a5e91e8c0fd65a Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Mon, 25 May 2020 01:25:56 +1000 Subject: [PATCH 44/52] minor copying memory is much faster --- src/column_reader.jl | 8 +------- src/column_reader_dev.jl | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 1589872..ce64846 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -51,9 +51,6 @@ function decompress_with_codec(compressed_data::Vector{UInt8}, codec)::Vector{UI end end -zero_or_missing(::Type{String}) = missing -zero_or_missing(::Type{T}) where T = zero(T) - function read_column(path, col_num) filemetadata = Parquet.metadata(path) par = ParFile(path) @@ -63,9 +60,6 @@ function read_column(path, col_num) # TODO detect if missing is necessary res = Vector{Union{Missing, T}}(undef, nrows(par)) - 
res .= zero_or_missing(T) - - length(filemetadata.row_groups) from = 1 last_from = from @@ -176,7 +170,7 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # raw_data = reinterpret(T, read(uncompressed_data_io)) # len_raw_data = length(raw_data) # to = min(from + len_raw_data - 1, res_len) - #res[from:to] .= raw_data + # res[from:to] .= raw_data end elseif data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY # this means the data is encoded in integers format which form the indices to the data diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 84c4ecd..82c05a1 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -42,6 +42,28 @@ end @time col1 = Parquet.read_column(path, 1) col1 == tbl.int32 +using BenchmarkTools + +@benchmark Parquet.read_column($path, 1) + + + + + + + + + +@benchmark Parquet.read_column($path, 1) + + + + + + + + + col_num = 5 filemetadata = Parquet.metadata(path) From 327c66ef605a6a2a34cae963a18ac5779d11192c Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Tue, 26 May 2020 23:29:21 +1000 Subject: [PATCH 45/52] fixed most of the non dictionary value reads --- Project.toml | 2 + src/column_reader.jl | 187 +++++++++++++++++++++++++++++++-------- src/column_reader_dev.jl | 106 +++++++--------------- 3 files changed, 188 insertions(+), 107 deletions(-) diff --git a/Project.toml b/Project.toml index 0f1ce6a..86dc08f 100644 --- a/Project.toml +++ b/Project.toml @@ -14,6 +14,8 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" LittleEndianBase128 = "1724a1d5-ab78-548d-94b3-135c294f96cf" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +NamedTupleTools = "d9ec5142-1e00-5aa0-9d6a-321866360f50" +ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Snappy = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" Thrift = "8d9c9c80-f77e-5080-9541-c6f69d204e22" diff --git a/src/column_reader.jl b/src/column_reader.jl index 1773192..77b6b03 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -59,10 +59,18 @@ function read_column(path, col_num) T = TYPES[filemetadata.schema[col_num+1]._type+1] # TODO detect if missing is necessary - res = Vector{Union{Missing, T}}(undef, nrows(par)) + if T == String + # the memory structure of String is different to other supported types + # so it's better to initialise it with missing + res = Vector{Union{Missing, String}}(missing, nrows(par)) + else + res = Vector{Union{Missing, T}}(undef, nrows(par)) + end from = 1 last_from = from + + j = 1 for row_group in filemetadata.row_groups colchunk_meta = row_group.columns[col_num].meta_data @@ -76,7 +84,18 @@ function read_column(path, col_num) if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY # see https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 # which is in effect the plain encoding see https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 - dict = reinterpret(T, uncompressed_data) + if T == String + dict = Vector{String}(undef, dict_page_header.dictionary_page_header.num_values) + uncompressed_data_io = IOBuffer(uncompressed_data) + j = 1 + while !eof(uncompressed_data_io) + str_len = read(uncompressed_data_io, UInt32) + dict[j] = String(read(uncompressed_data_io, str_len)) + j += 1 + end + else + dict = reinterpret(T, uncompressed_data) + end else error("Only Plain Dictionary 
encoding is supported") end @@ -88,10 +107,15 @@ function read_column(path, col_num) seek(fileio, colchunk_meta.data_page_offset) # repeated read data page - while from - last_from < row_group.num_rows + + while (from - last_from < row_group.num_rows) & (from <= length(res)) from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 end last_from = from + + # (j == 1) && return res + j += 1 + end res @@ -99,7 +123,7 @@ end function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integer = 1) """ - This function assumes + Read one data page """ # the result length is used latter on to prevent writing too much data @@ -119,6 +143,12 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) + # this will be set in future + has_missing = false + + # the number of values stored in this page + num_values = data_page_header.data_page_header.num_values + # definition levels # do_read_defn_lvls = isfilled(data_page_header.data_page_header, :statistics) && # isfilled(data_page_header.data_page_header.statistics, :null_count) && @@ -131,12 +161,28 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ pos_before_encoded_data = position(uncompressed_data_io) encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + # TODO it's possible to be mixing RLE and bitpacked in one algorithm if iseven(encoded_data_header) # RLE encoded rle_len = Int(encoded_data_header >> 1) - rle_val = read(uncompressed_data_io, 1) + rle_val = read(uncompressed_data_io, UInt8) + pos_after_reading_encoded_data = position(uncompressed_data_io) + + if T == String + # strings memoery are stored differently so can't benefit from this + else + # fill the memory location with all missing + GC.@preserve res begin + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from - 1 + tmparray = unsafe_wrap(Vector{UInt8}, dest_ptr, num_values) + fill!(tmparray, rle_val) + end + end else + # the only reaosn to use bitpacking is because there are missings + has_missing = true + # bitpacked encoded bit_pack_len = Int(encoded_data_header >> 1) @@ -146,51 +192,122 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ pos_after_reading_encoded_data = position(uncompressed_data_io) # the structure of Vector{Union{T, Missing}} is - # * the T values first + # * the `values::T` first # * the missing are stored with UInt8(0) for missing # * and UInt8(1) otherwise # see https://docs.julialang.org/en/v1/devdocs/isbitsunionarrays/ - missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect - src_ptr = Ptr{UInt8}(pointer(missing_bytes)) - dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + # TODO I suspect this is not the fastest way to unpack bitwidth = 1 + # data + @assert bitwidth == 1 + bp = BitPackedIterator(data, bitwidth) + + missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect - # copy content over - GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, res_len) + if T == String + # do nothing + else + GC.@preserve missing_bytes res begin + src_ptr = Ptr{UInt8}(pointer(missing_bytes)) + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from - 1 + # copy content over + unsafe_copyto!(dest_ptr, src_ptr, res_len) + end + end end else - error("encoding not supported") + error("no definition encoding not supported") end + # this line ensures that we have read all the encoded 
definition data @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len - # this is how many values should have been read - num_values_check = data_page_header.data_page_header.num_values - - # valuess + # read values if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN # just return the data as is - # TODO would it better if take! is done? - if T == Bool - # for boolean every bit is a value so the length is 8 times - digits(UInt8, read(uncompressed_data_io), base=2) - len_raw_data = 8length(raw_data) + to = min(from + num_values - 1, res_len) + + if has_missing + upto = 1 + raw_data = Vector{Bool}(undef, 8) + for (i, missing_byte) in zip(from:to, missing_bytes) + if missing_byte == 1 + if upto == 1 + digits!(raw_data, read(uncompressed_data_io, UInt8), base=2) + end + res[i] = raw_data[upto] + upto += 1 + if upto == 9 + upto = 1 + end + end + end + else + # for boolean every bit is a value so the length is 8 times + i = from + while !eof(uncompressed_data_io) + udi = read(uncompressed_data_io, UInt8) + raw_data = Base.unsafe_wrap(Vector{Bool}, pointer(res, i) |> Ptr{Bool}, (8,)) + digits!(raw_data, udi, base=2) + + if i + 8 - 1 <= res_len + digits!(raw_data, udi, base=2) + i += 8 + else + for rd in digits(Bool, udi, base=2, pad = 8) + if i <= res_len + res[i] = rd + end + i += 1 + end + end + end + end + elseif T == String + to = min(from + num_values - 1, res_len) + if has_missing + for (i, missing_byte) in zip(from:to, missing_bytes) + if missing_byte == 1 + # 1 means not missing + str_len = read(uncompressed_data_io, UInt32) + res[i] = String(read(uncompressed_data_io, str_len)) + end + end + else + i = from + while !eof(uncompressed_data_io) + str_len = read(uncompressed_data_io, UInt32) + res[i] = String(read(uncompressed_data_io, str_len)) + i = i + 1 + end + end + else - # the copying approach is alot faster than the commented out - # assignment approach - pos_for_pointer = position(uncompressed_data_io) + 1 - src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) - dest_ptr = Ptr{T}(pointer(res, from)) - # copy content over - GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, num_values_check) - to = min(from + num_values_check - 1, res_len) - - - # raw_data = reinterpret(T, read(uncompressed_data_io)) - # len_raw_data = length(raw_data) - # to = min(from + len_raw_data - 1, res_len) - # @inbounds res[from:to] .= raw_data + if has_missing + raw_data = reinterpret(T, read(uncompressed_data_io)) + to = min(from + num_values - 1, res_len) + + j = 1 + for (i, missing_byte) in zip(from:to, missing_bytes) + if missing_byte == 1 + # 1 means not missing + res[i] = raw_data[j] + j += 1 + end + end + else + # if there is no missing, can just copy the data into the + # right memory location + # the copying approach is alot faster than the commented out + # assignment approach + pos_for_pointer = position(uncompressed_data_io) + 1 + src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) + dest_ptr = Ptr{T}(pointer(res, from)) + # copy content over + GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, num_values) + to = min(from + num_values - 1, res_len) + end end elseif data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY # this means the data is encoded in integers format which form the indices to the data diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 7ad8215..361e133 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -13,6 +13,8 @@ 
nrows(par) colnames(par) +@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); + using Random: randstring tbl = ( int32 = rand(Int32, 1000), @@ -27,94 +29,54 @@ tbl = ( float64m = rand([missing, rand(Float64, 10)...], 1000), boolm = rand([missing, true, false], 1000), stringm = rand([missing, "abc", "def", "ghi"], 1000) -) +); tmpfile = tempname()*".parquet" -write_parquet(tmpfile, tbl) +write_parquet(tmpfile, tbl); path = tmpfile -@time col1 = Parquet.read_column(path, 4) - -for i in 1:12 - @time col1 = Parquet.read_column(path, i); -end - -function read_filep(path, n) - collect(Parquet.read_column(path, i) for i in 1:n) +col_num = 3 +@time col1 = Parquet.read_column(path, col_num); +col1 +correct = getproperty(tbl, keys(tbl)[col_num]) +all(ismissing.(col1) .== ismissing.(correct)) +all(skipmissing(col1) .== skipmissing(correct)) + +using Test +checkcol(col_num) = begin + println(col_num) + @time col1 = Parquet.read_column(path, col_num); + # correct = getproperty(tbl, keys(tbl)[col_num]) + # @test all(ismissing.(col1) .== ismissing.(correct)) + # @test all(skipmissing(col1) .== skipmissing(correct)) end -@time a = read_filep(path, 4); - - - - - - - - - -@benchmark Parquet.read_column($path, 1) - - +@time checkcol.(1:31) - - - - -col_num = 5 - -filemetadata = Parquet.metadata(path) -par = ParFile(path) -fileio = open(path) - -T = TYPES[filemetadata.schema[col_num+1]._type+1] - -# TODO detect if missing is necessary -res = Vector{Union{Missing, T}}(missing, nrows(par)) - -length(filemetadata.row_groups) - -from = 1 -last_from = from - -row_group = filemetadata.row_groups[1] - -colchunk_meta = row_group.columns[col_num].meta_data - -if isfilled(colchunk_meta, :dictionary_page_offset) - seek(fileio, colchunk_meta.dictionary_page_offset) - dict_page_header = read_thrift(fileio, PAR2.PageHeader) - compressed_data = read(fileio, dict_page_header.compressed_page_size) - uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) - @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size - - if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY - # see https://github.com/apache/parquet-format/blob/master/Encodings.md#dictionary-encoding-plain_dictionary--2-and-rle_dictionary--8 - # which is in effect the plain encoding see https://github.com/apache/parquet-format/blob/master/Encodings.md#plain-plain--0 - dict = reinterpret(T, uncompressed_data) - else - error("Only Plain Dictionary encoding is supported") +using Base.Threads: @spawn +read1(path, n) = begin + result = Vector{Any}(undef, length(n)) + for i in n + result[i] = @spawn Parquet.read_column(path, i) end -else - dict = nothing + fetch.(result) end -# seek to the first data page -seek(fileio, colchunk_meta.data_page_offset) +@time a = read1(path, 1:5) -pg = read_thrift(fileio, PAR2.PageHeader) +using DataFrames -Parquet.read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) +@time ba=DataFrame(a, copycols=false) +@time ba=DataFrame(a) + +b1 -# repeated read data page -while from - last_from < row_group.num_rows - from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 -end -last_from = from +import Base: add_int +@edit Base.add_int(100, 1) -res +add_int From 02836c2b0dd983bb4fcf4f6f54a4bb6e1d418086 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 May 2020 15:36:33 +1000 Subject: [PATCH 46/52] more updates --- src/column_reader.jl | 154 +++++++++++++++++++++++---------------- src/column_reader_dev.jl | 25 
+++++-- src/metadata.jl | 4 + 3 files changed, 116 insertions(+), 67 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 77b6b03..743d42a 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -51,13 +51,15 @@ function decompress_with_codec(compressed_data::Vector{UInt8}, codec)::Vector{UI end end -function read_column(path, col_num) +read_column(path, col_num) = begin filemetadata = Parquet.metadata(path) - par = ParFile(path) - fileio = open(path) + read_column(path, filemetadata, col_num) +end +function read_column(path, filemetadata, col_num) T = TYPES[filemetadata.schema[col_num+1]._type+1] + par = ParFile(path) # TODO detect if missing is necessary if T == String # the memory structure of String is different to other supported types @@ -66,6 +68,9 @@ function read_column(path, col_num) else res = Vector{Union{Missing, T}}(undef, nrows(par)) end + close(par) + + fileio = open(path) from = 1 last_from = from @@ -109,7 +114,13 @@ function read_column(path, col_num) # repeated read data page while (from - last_from < row_group.num_rows) & (from <= length(res)) - from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + 1 + from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + + if from isa Tuple + return from + else + from += 1 + end end last_from = from @@ -159,60 +170,83 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ bitwidth = 1 encoded_data_len = read(uncompressed_data_io, UInt32) pos_before_encoded_data = position(uncompressed_data_io) - encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) - # TODO it's possible to be mixing RLE and bitpacked in one algorithm - if iseven(encoded_data_header) - # RLE encoded - rle_len = Int(encoded_data_header >> 1) - rle_val = read(uncompressed_data_io, UInt8) + from_defn = from - pos_after_reading_encoded_data = position(uncompressed_data_io) + pos_after_reading_encoded_data = pos_before_encoded_data - if T == String - # strings memoery are stored differently so can't benefit from this - else - # fill the memory location with all missing - GC.@preserve res begin - dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from - 1 - tmparray = unsafe_wrap(Vector{UInt8}, dest_ptr, num_values) - fill!(tmparray, rle_val) + # initialise it to something + missing_bytes = UInt8[] + + while (pos_after_reading_encoded_data - pos_before_encoded_data) < encoded_data_len + encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + + # TODO it's possible to be mixing RLE and bitpacked in one algorithm + if iseven(encoded_data_header) + # RLE encoded + rle_len = Int(encoded_data_header >> 1) + rle_val = read(uncompressed_data_io, UInt8) + + pos_after_reading_encoded_data = position(uncompressed_data_io) + + if T == String + # strings memoery are stored differently so can't benefit from this + else + # fill the memory location with all missing + GC.@preserve res begin + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 + tmparray = unsafe_wrap(Vector{UInt8}, dest_ptr, rle_len) + fill!(tmparray, rle_val) + end end - end - else - # the only reaosn to use bitpacking is because there are missings - has_missing = true - # bitpacked encoded - bit_pack_len = Int(encoded_data_header >> 1) + from_defn = min(from_defn + rle_len - 1, res_len) + else + # the only reaosn to use bitpacking is because there are missings + has_missing = true - bytes_to_read = bitwidth*bit_pack_len - data = read(uncompressed_data_io, 
bytes_to_read) + # bitpacked encoded + bit_pack_len = Int(encoded_data_header >> 1) - pos_after_reading_encoded_data = position(uncompressed_data_io) + bytes_to_read = bitwidth*bit_pack_len + data = read(uncompressed_data_io, bytes_to_read) - # the structure of Vector{Union{T, Missing}} is - # * the `values::T` first - # * the missing are stored with UInt8(0) for missing - # * and UInt8(1) otherwise - # see https://docs.julialang.org/en/v1/devdocs/isbitsunionarrays/ + pos_after_reading_encoded_data = position(uncompressed_data_io) - # TODO I suspect this is not the fastest way to unpack bitwidth = 1 - # data - @assert bitwidth == 1 - bp = BitPackedIterator(data, bitwidth) + # the structure of Vector{Union{T, Missing}} is + # * the `values::T` first + # * the missing are stored with UInt8(0) for missing + # * and UInt8(1) otherwise + # see https://docs.julialang.org/en/v1/devdocs/isbitsunionarrays/ - missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect + # TODO I suspect this is not the fastest way to unpack bitwidth = 1 + # data + @assert bitwidth == 1 + bp = BitPackedIterator(data, bitwidth) - if T == String - # do nothing - else - GC.@preserve missing_bytes res begin - src_ptr = Ptr{UInt8}(pointer(missing_bytes)) - dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from - 1 - # copy content over - unsafe_copyto!(dest_ptr, src_ptr, res_len) + missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect + + if T == String + # do nothing + else + GC.@preserve missing_bytes res begin + if from_defn + length(missing_bytes) - 1 <= res_len + # if not too long then can straight copy + src_ptr = Ptr{UInt8}(pointer(missing_bytes)) + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 + # copy content over + unsafe_copyto!(dest_ptr, src_ptr, length(missing_bytes)) + else + missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(missing_bytes), res_len - from_defn + 1) + src_ptr = Ptr{UInt8}(pointer(missing_bytes_smaller)) + dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 + # copy content over + unsafe_copyto!(dest_ptr, src_ptr, length(missing_bytes_smaller)) + end + end end + + from_defn = min(from_defn + length(missing_bytes) - 1, res_len) end end else @@ -222,12 +256,14 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # this line ensures that we have read all the encoded definition data @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len + # read values + to = from + num_values - 1 + @assert to <= res_len + if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN # just return the data as is if T == Bool - to = min(from + num_values - 1, res_len) - if has_missing upto = 1 raw_data = Vector{Bool}(undef, 8) @@ -265,7 +301,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end end elseif T == String - to = min(from + num_values - 1, res_len) if has_missing for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 @@ -286,8 +321,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else if has_missing raw_data = reinterpret(T, read(uncompressed_data_io)) - to = min(from + num_values - 1, res_len) - j = 1 for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 @@ -302,11 +335,12 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the copying approach is alot faster than the commented out # assignment approach pos_for_pointer = 
position(uncompressed_data_io) + 1 - src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) - dest_ptr = Ptr{T}(pointer(res, from)) - # copy content over - GC.@preserve src_ptr dest_ptr unsafe_copyto!(dest_ptr, src_ptr, num_values) - to = min(from + num_values - 1, res_len) + GC.@preserve uncompressed_data res begin + src_ptr = Ptr{T}(pointer(uncompressed_data, pos_for_pointer)) + dest_ptr = Ptr{T}(pointer(res, from)) + # copy content over + unsafe_copyto!(dest_ptr, src_ptr, num_values) + end end end elseif data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY @@ -317,7 +351,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ @assert bitwidth <= 32 while !eof(uncompressed_data_io) - # println(position(uncompressed_data_io)) encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) if iseven(encoded_data_header) @@ -331,8 +364,7 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ rle_val = rle_val | tmp end - to = min(from + rle_len - 1, res_len) - res[from:to] .= dict[rle_val+1] + res[from:min(to, from + rle_len - 1)] .= dict[rle_val+1] from = from + rle_len else @@ -345,9 +377,8 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # now need a decoding algorithm to break it up # reading `bitwidth` bits at a time l = length(bp) - to = min(from + l - 1, res_len) - for (v, i) in zip(bp, from:to) + for (v, i) in zip(bp, from:min(from + l - 1, to)) res[i] = dict[v+1] end from = from + l @@ -356,6 +387,5 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else erorr("encoding not supported") end - to end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 361e133..2e1d305 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -12,8 +12,9 @@ par = ParFile(path); nrows(par) colnames(par) +close(par) -@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); +#@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); using Random: randstring tbl = ( @@ -35,25 +36,39 @@ tmpfile = tempname()*".parquet" write_parquet(tmpfile, tbl); +@time adf = read_parquet(tmpfile); + + path = tmpfile -col_num = 3 +col_num = 5 @time col1 = Parquet.read_column(path, col_num); +col1 + +uncompressed_data_io = col1[1] + +encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) + +using Debugger + +filemetadata = Parquet.metadata(path); +Debugger.@enter Parquet.read_column(path, filemetadata, col_num); + col1 correct = getproperty(tbl, keys(tbl)[col_num]) all(ismissing.(col1) .== ismissing.(correct)) all(skipmissing(col1) .== skipmissing(correct)) using Test -checkcol(col_num) = begin +checkcol(path, col_num) = begin println(col_num) - @time col1 = Parquet.read_column(path, col_num); + @elapsed col1 = Parquet.read_column(path, col_num); # correct = getproperty(tbl, keys(tbl)[col_num]) # @test all(ismissing.(col1) .== ismissing.(correct)) # @test all(skipmissing(col1) .== skipmissing(correct)) end -@time checkcol.(1:31) +@time checkcol.(path, 1:31) diff --git a/src/metadata.jl b/src/metadata.jl index 1c8c5af..dad19d8 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -10,4 +10,8 @@ function metadata(path) datasize = sz - meta_len - 2SZ_PAR_MAGIC - SZ_FOOTER seek(io, SZ_PAR_MAGIC + datasize) filemetadata = read_thrift(io, PAR2.FileMetaData) + + close(io) + + filemetadata end From 070988eba96e1c67ea917d3f3cde9b4709506e22 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 
May 2020 17:23:09 +1000 Subject: [PATCH 47/52] fixed all bugs --- src/column_reader.jl | 99 ++++++++++++++++++++++++++++++++-------- src/column_reader_dev.jl | 37 +++++++++++---- 2 files changed, 108 insertions(+), 28 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 743d42a..2ce5fb7 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -140,8 +140,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the result length is used latter on to prevent writing too much data res_len = length(res) - to = from # intialise to something - data_page_header = read_thrift(fileio, PAR2.PageHeader) compressed_data = read(fileio, data_page_header.compressed_page_size) uncompressed_data = decompress_with_codec(compressed_data, codec) @@ -200,7 +198,11 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end end - from_defn = min(from_defn + rle_len - 1, res_len) + append!(missing_bytes, fill(rle_val, rle_len)) + + from_defn += rle_len + @assert from_defn - from == length(missing_bytes) + @assert length(missing_bytes) <= num_values else # the only reaosn to use bitpacking is because there are missings has_missing = true @@ -224,29 +226,58 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ @assert bitwidth == 1 bp = BitPackedIterator(data, bitwidth) - missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect + tmp_missing_bytes::Vector{UInt8} = BitPackedIterator(data, bitwidth) |> collect + + len_of_tmp_missing_bytes = length(tmp_missing_bytes) + @assert mod(len_of_tmp_missing_bytes, 8) == 0 + + # the tmp_missing_bytes is always in a multiple of 8 so need to + # be careful not to write too much + last_from_defn = from_defn + + # compute the new from_defn + from_defn = min(from_defn + len_of_tmp_missing_bytes, from + num_values) + + len_to_write = from_defn - last_from_defn if T == String # do nothing else - GC.@preserve missing_bytes res begin - if from_defn + length(missing_bytes) - 1 <= res_len + GC.@preserve tmp_missing_bytes res begin + if len_to_write == len_of_tmp_missing_bytes + + append!(missing_bytes, tmp_missing_bytes) + + # @assert from_defn-from == length(missing_bytes) + + if length(missing_bytes) > num_values + println(tmp_missing_bytes) + println("$last_from_defn $from_defn $(from+num_values) $len_to_write $len_of_tmp_missing_bytes") + end + # @assert length(missing_bytes) <= num_values # if not too long then can straight copy - src_ptr = Ptr{UInt8}(pointer(missing_bytes)) + src_ptr = Ptr{UInt8}(pointer(tmp_missing_bytes)) dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 # copy content over - unsafe_copyto!(dest_ptr, src_ptr, length(missing_bytes)) - else - missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(missing_bytes), res_len - from_defn + 1) - src_ptr = Ptr{UInt8}(pointer(missing_bytes_smaller)) + unsafe_copyto!(dest_ptr, src_ptr, length(tmp_missing_bytes)) + elseif len_to_write < len_of_tmp_missing_bytes + tmp_missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(tmp_missing_bytes), len_to_write) + # @assert length(tmp_missing_bytes_smaller) == len_to_write + append!(missing_bytes, tmp_missing_bytes_smaller) + # @assert from_defn - from == length(missing_bytes) + # @assert length(missing_bytes) == num_values + + src_ptr = Ptr{UInt8}(pointer(tmp_missing_bytes_smaller)) dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 # copy content over - unsafe_copyto!(dest_ptr, src_ptr, 
length(missing_bytes_smaller)) + unsafe_copyto!(dest_ptr, src_ptr, length(tmp_missing_bytes_smaller)) + else + error("something is wrong") end end end - - from_defn = min(from_defn + length(missing_bytes) - 1, res_len) + # @assert from_defn-from == length(missing_bytes) + # @assert length(missing_bytes) <= num_values end end else @@ -256,12 +287,17 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # this line ensures that we have read all the encoded definition data @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len + if has_missing + @assert length(missing_bytes) == num_values + end + # read values to = from + num_values - 1 @assert to <= res_len if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN + # println("meh") # just return the data as is if T == Bool if has_missing @@ -364,7 +400,16 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ rle_val = rle_val | tmp end - res[from:min(to, from + rle_len - 1)] .= dict[rle_val+1] + if has_missing + index = from:min(to, from + rle_len - 1) + for (i, missing_byte) in zip(index, missing_bytes) + if missing_byte == 1 + res[i] = dict[rle_val+1] + end + end + else + res[from:min(to, from + rle_len - 1)] .= dict[rle_val+1] + end from = from + rle_len else @@ -373,19 +418,35 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ @assert (bit_pack_len >= 1) && (bit_pack_len <= 2^31 - 1) bytes_to_read = bitwidth*bit_pack_len data = read(uncompressed_data_io, bytes_to_read) - bp = BitPackedIterator(data, bitwidth) + # TODO remove the collect here + bp = BitPackedIterator(data, bitwidth) |> collect # now need a decoding algorithm to break it up # reading `bitwidth` bits at a time l = length(bp) - for (v, i) in zip(bp, from:min(from + l - 1, to)) - res[i] = dict[v+1] + index = from:min(from + l - 1, to) + + if has_missing + j = 1 + for (i, missing_byte) in zip(index, missing_bytes) + if missing_byte == 1 + res[i] = dict[bp[j]+1] + j += 1 + end + end + else + for (i, v) in zip(index, bp) + res[i] = dict[v+1] + end end + + from = from + l end end else erorr("encoding not supported") end - to + + return to end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 2e1d305..f1d8dee 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -6,6 +6,10 @@ using Snappy, CodecZlib, CodecZstd path = "c:/git/parquet-data-collection/dsd50p.parquet" path = "c:/data/Performance_2003Q3.txt.parquet" +col_num = 1 +@time col1 = Parquet.read_column(path, col_num); +col1 + meta = Parquet.metadata(path); par = ParFile(path); @@ -41,10 +45,14 @@ write_parquet(tmpfile, tbl); path = tmpfile -col_num = 5 -@time col1 = Parquet.read_column(path, col_num); + + col1 +col1[19:20] + +last(col1) + uncompressed_data_io = col1[1] encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) @@ -60,15 +68,26 @@ all(ismissing.(col1) .== ismissing.(correct)) all(skipmissing(col1) .== skipmissing(correct)) using Test -checkcol(path, col_num) = begin - println(col_num) - @elapsed col1 = Parquet.read_column(path, col_num); - # correct = getproperty(tbl, keys(tbl)[col_num]) - # @test all(ismissing.(col1) .== ismissing.(correct)) - # @test all(skipmissing(col1) .== skipmissing(correct)) +using Base.Threads: @spawn + +checkcol(path, n; multithreaded=true) = begin + res = Vector{Any}(undef, n) + if multithreaded + for col_num in 1:n + res[col_num] = @spawn Parquet.read_column(path, col_num); + end + return 
fetch.(res) + else + for col_num in 1:n + println(col_num) + res[col_num] = Parquet.read_column(path, col_num); + end + return res + end end -@time checkcol.(path, 1:31) +@time checkcol(path, 31, multithreaded=true); +@time checkcol(path, 31, multithreaded=false); From 08a961bd29920b75843ff1340164993c523c285a Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 May 2020 19:23:45 +1000 Subject: [PATCH 48/52] fixed memory bug --- src/column_reader.jl | 54 +++++++++++++++++++--------------------- src/column_reader_dev.jl | 52 +++++++++++++++++++++++--------------- 2 files changed, 57 insertions(+), 49 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 2ce5fb7..1f67e05 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -233,51 +233,44 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the tmp_missing_bytes is always in a multiple of 8 so need to # be careful not to write too much - last_from_defn = from_defn - # compute the new from_defn - from_defn = min(from_defn + len_of_tmp_missing_bytes, from + num_values) + new_from_defn = min(from_defn + len_of_tmp_missing_bytes, from + num_values) + + len_to_write = new_from_defn - from_defn - len_to_write = from_defn - last_from_defn + if len_to_write == len_of_tmp_missing_bytes + append!(missing_bytes, tmp_missing_bytes) + elseif len_to_write < len_of_tmp_missing_bytes + tmp_missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(tmp_missing_bytes), len_to_write) + append!(missing_bytes, tmp_missing_bytes_smaller) + else + error("something is wrong") + end if T == String # do nothing else - GC.@preserve tmp_missing_bytes res begin - if len_to_write == len_of_tmp_missing_bytes - - append!(missing_bytes, tmp_missing_bytes) - - # @assert from_defn-from == length(missing_bytes) - - if length(missing_bytes) > num_values - println(tmp_missing_bytes) - println("$last_from_defn $from_defn $(from+num_values) $len_to_write $len_of_tmp_missing_bytes") - end - # @assert length(missing_bytes) <= num_values + if len_to_write == len_of_tmp_missing_bytes + GC.@preserve tmp_missing_bytes res begin # if not too long then can straight copy src_ptr = Ptr{UInt8}(pointer(tmp_missing_bytes)) dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 # copy content over unsafe_copyto!(dest_ptr, src_ptr, length(tmp_missing_bytes)) - elseif len_to_write < len_of_tmp_missing_bytes - tmp_missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(tmp_missing_bytes), len_to_write) - # @assert length(tmp_missing_bytes_smaller) == len_to_write - append!(missing_bytes, tmp_missing_bytes_smaller) - # @assert from_defn - from == length(missing_bytes) - # @assert length(missing_bytes) == num_values - + end + elseif len_to_write < len_of_tmp_missing_bytes + GC.@preserve tmp_missing_bytes_smaller res begin src_ptr = Ptr{UInt8}(pointer(tmp_missing_bytes_smaller)) dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 # copy content over - unsafe_copyto!(dest_ptr, src_ptr, length(tmp_missing_bytes_smaller)) - else - error("something is wrong") + unsafe_copyto!(dest_ptr, src_ptr, len_to_write) end + else + error("something is wrong") end + end - # @assert from_defn-from == length(missing_bytes) - # @assert length(missing_bytes) <= num_values + from_defn = new_from_defn end end else @@ -320,7 +313,9 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ i = from while !eof(uncompressed_data_io) udi = read(uncompressed_data_io, UInt8) - raw_data = 
Base.unsafe_wrap(Vector{Bool}, pointer(res, i) |> Ptr{Bool}, (8,)) + GC.@preserve res begin + raw_data = Base.unsafe_wrap(Vector{Bool}, pointer(res, i) |> Ptr{Bool}, (8,)) + end digits!(raw_data, udi, base=2) if i + 8 - 1 <= res_len @@ -357,6 +352,7 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else if has_missing raw_data = reinterpret(T, read(uncompressed_data_io)) + return raw_data, missing_bytes j = 1 for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index f1d8dee..e42a673 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -3,23 +3,6 @@ using Parquet:TYPES, read_thrift, PAR2, BitPackedIterator, decompress_with_codec using Thrift: isfilled using Snappy, CodecZlib, CodecZstd -path = "c:/git/parquet-data-collection/dsd50p.parquet" -path = "c:/data/Performance_2003Q3.txt.parquet" - -col_num = 1 -@time col1 = Parquet.read_column(path, col_num); -col1 - -meta = Parquet.metadata(path); -par = ParFile(path); - -nrows(par) - -colnames(par) -close(par) - -#@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); - using Random: randstring tbl = ( int32 = rand(Int32, 1000), @@ -38,15 +21,42 @@ tbl = ( tmpfile = tempname()*".parquet" -write_parquet(tmpfile, tbl); +@time write_parquet(tmpfile, tbl); +path = tmpfile -@time adf = read_parquet(tmpfile); +col_num=12 +@time col1 = Parquet.read_column(path, col_num); +all(col1 .=== tbl.stringm) + + +using BenchmarkTools +@benchmark adf = read_parquet(path) -path = tmpfile + + + +path = "c:/git/parquet-data-collection/dsd50p.parquet" +path = "c:/data/Performance_2003Q3.txt.parquet" + +col_num = 1 +@time col1 = Parquet.read_column(path, col_num); +col1 + +meta = Parquet.metadata(path); +par = ParFile(path); + +nrows(par) + +colnames(par) +close(par) + +#@time tbl = Parquet.read_column.(Ref(path), 1:length(colnames(par))); + + col1 col1[19:20] @@ -89,6 +99,8 @@ end @time checkcol(path, 31, multithreaded=true); @time checkcol(path, 31, multithreaded=false); +@time checkcol(path, 12, multithreaded=false); + using Base.Threads: @spawn From dc619e3904d10d54b3dc19f45e05410a36f7a35f Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 May 2020 19:39:28 +1000 Subject: [PATCH 49/52] fixed bug with parquet reader --- src/column_reader_dev.jl | 6 +- src/read_parquet.jl | 33 ++-- src/show.jl | 417 ++++++++++++++++++++------------------- 3 files changed, 232 insertions(+), 224 deletions(-) diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index e42a673..34e9a58 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -28,12 +28,13 @@ col_num=12 @time col1 = Parquet.read_column(path, col_num); all(col1 .=== tbl.stringm) +a = read_parquet(path) using BenchmarkTools @benchmark adf = read_parquet(path) - +adf @@ -42,6 +43,9 @@ using BenchmarkTools path = "c:/git/parquet-data-collection/dsd50p.parquet" path = "c:/data/Performance_2003Q3.txt.parquet" +@time adf = read_parquet(path); + + col_num = 1 @time col1 = Parquet.read_column(path, col_num); col1 diff --git a/src/read_parquet.jl b/src/read_parquet.jl index 4916a6c..de953a9 100644 --- a/src/read_parquet.jl +++ b/src/read_parquet.jl @@ -10,15 +10,12 @@ read_parquet(path; kwargs...) = read_parquet(path, String[]; kwargs...) 
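The multithreaded path in the checkcol helper above and in the read_parquet rewrite that follows both rely on the same pattern: spawn one task per column, then fetch the results back in column order. A minimal sketch of that spawn-then-fetch pattern, with read_one_column as a hypothetical stand-in for Parquet.read_column:

    using Base.Threads: @spawn
    using NamedTupleTools: namedtuple

    # Spawn one task per column, then fetch in order so the output columns
    # line up with `colnames` regardless of task completion order.
    function read_columns_threaded(read_one_column, path, colnames::Vector{Symbol})
        tasks = [@spawn read_one_column(path, i) for i in eachindex(colnames)]
        namedtuple(colnames, fetch.(tasks))
    end

fetch rethrows any exception raised inside a task, so a failing column read surfaces at the fetch call instead of being dropped silently.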
function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = false) """function for reading parquet""" - if multithreaded - # use a bounded channel to limit - c1 = Channel{Bool}(Threads.nthreads()) - atexit(()->close(c1)) - end + par = ParFile(path) + nc = ncols(par) - nc = ncols(ParFile(path)) + colnames = [sch.name for sch in drop(par.schema.schema, 1)] - colnames = [sch.name for sch in drop(ParFile(path).schema.schema, 1)] + close(par) if length(cols) == 0 colnums = collect(1:nc) @@ -31,13 +28,8 @@ function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = filemetadata = metadata(path) if multithreaded - @showprogress for (i, j) in enumerate(colnums) - put!(c1, true) - results[i] = @spawn begin - res = read_column(path, filemetadata, j) - take!(c1) - res - end + for (i, j) in enumerate(colnums) + results[i] = @spawn read_column(path, filemetadata, j) end else @showprogress for (i, j) in enumerate(colnums) @@ -47,10 +39,11 @@ function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = symbol_col_names = collect(Symbol(col) for col in colnames[colnums]) - if multithreaded - fnl_results = collect(fetch(result) for result in results) - return namedtuple(symbol_col_names, fnl_results) - else - return namedtuple(symbol_col_names, results) - end + if multithreaded + @showprogress for i in 1:length(results) + results[i] = fetch(results[i]) + end + end + + return namedtuple(symbol_col_names, results) end diff --git a/src/show.jl b/src/show.jl index 27ef44a..fcd17a9 100644 --- a/src/show.jl +++ b/src/show.jl @@ -1,203 +1,214 @@ -function print_indent(io, n) - for d in 1:n - print(io, " ") - end -end - -function show(io::IO, cursor::RecordCursor) - par = cursor.par - rows = cursor.colcursors[1].row.rows - println(io, "Record Cursor on $(par.path)") - println(io, " rows: $rows") - - colpaths = [join(colname, '.') for colname in cursor.colnames] - println(io, " cols: $(join(colpaths, ", "))") -end - -function show(io::IO, schema::SchemaElement, indent::AbstractString="", nchildren::Vector{Int}=Int[]) - print(io, indent) - lchildren = length(nchildren) - print_indent(io, lchildren) - if isfilled(schema, :repetition_type) - r = schema.repetition_type - print(io, (r == FieldRepetitionType.REQUIRED) ? "required" : (r == FieldRepetitionType.OPTIONAL) ? 
"optional" : "repeated", " "); - end - isfilled(schema, :_type) && print(io, Thrift.enumstr(_Type, schema._type), " ") - - print(io, schema.name) - isfilled(schema, :field_id) && print(io, " (", schema.field_id, ")") - - if isfilled(schema, :converted_type) - print(io, "# (from ", Thrift.enumstr(ConvertedType, schema.converted_type)) - if schema.converted_type == ConvertedType.DECIMAL - print(io, "(", schema.scale, ".", schema.precision) - end - print(") ") - end - - if isfilled(schema, :num_children) - push!(nchildren, schema.num_children) - print(io, " {") - elseif lchildren > 0 - nchildren[lchildren] -= 1 - if nchildren[lchildren] == 0 - pop!(nchildren) - println(io, "") - print_indent(io, length(nchildren)) - print(io, indent, "}") - end - end - - println(io, "") -end - -function show(io::IO, schema::Vector{SchemaElement}, indent::AbstractString="") - println(io, indent, "Schema:") - nchildren=Int[] - for schemaelem in schema - show(io, schemaelem, indent * " ", nchildren) - end -end - -show(io::IO, schema::Schema, indent::AbstractString="") = show(io, schema.schema, indent) - -function show(io::IO, kvmeta::KeyValue, indent::AbstractString="") - println(io, indent, kvmeta.key, " => ", kvmeta.value) -end - -function show(io::IO, kvmetas::Vector{KeyValue}, indent::AbstractString="") - isempty(kvmetas) && return - println(io, indent, "Metadata:") - for kvmeta in kvmetas - show(io, kvmeta, indent * " ") - end -end - -function show_encodings(io::IO, encodings::Vector{Int32}, indent::AbstractString="") - isempty(encodings) && return - print(io, indent, "Encodings: ") - pfx = "" - for encoding in encodings - print(io, pfx, Thrift.enumstr(Encoding, encoding)) - pfx = ", " - end - println(io, "") -end - -show(io::IO, hdr::IndexPageHeader, indent::AbstractString="") = nothing -function show(io::IO, page::DictionaryPageHeader, indent::AbstractString="") - println(io, indent, page.num_values, " values") -end - -function show(io::IO, hdr::DataPageHeader, indent::AbstractString="") - println(io, indent, hdr.num_values, " values") - println(io, indent, "encodings: values as ", Thrift.enumstr(Encoding, hdr.encoding), ", definitions as ", Thrift.enumstr(Encoding, hdr.definition_level_encoding), ", repetitions as ", Thrift.enumstr(Encoding, hdr.repetition_level_encoding)) - Thrift.isfilled(hdr, :statistics) && show(io, hdr.statistics, indent) -end - -function show(io::IO, hdr::DataPageHeaderV2, indent::AbstractString="") - compressed = Thrift.isfilled(hdr, :is_compressed) ? 
hdr.is_compressed : true - println(io, indent, hdr.num_values, " values, ", hdr.num_nulls, " nulls, ", hdr.num_rows, " rows, compressed:", compressed) - println(io, indent, "encoding:", Thrift.enumstr(Encoding, hdr.encoding), ", definition:", Thrift.enumstr(Encoding, hdr.definition_level_encoding), ", repetition:", Thrift.enumstr(Encoding, hdr.repetition_level_encoding)) - Thrift.isfilled(hdr, :statistics) && show(io, hdr.statistics, indent) -end - -function show(io::IO, page::PageHeader, indent::AbstractString="") - println(io, indent, Thrift.enumstr(PageType, page._type), " compressed bytes:", page.compressed_page_size, " (", page.uncompressed_page_size, " uncompressed)") - Thrift.isfilled(page, :data_page_header) && show(io, page.data_page_header, indent * " ") - Thrift.isfilled(page, :data_page_header_v2) && show(io, page.data_page_header_v2, indent * " ") - Thrift.isfilled(page, :index_page_header) && show(io, page.index_page_header, indent * " ") - Thrift.isfilled(page, :dictionary_page_header) && show(io, page.dictionary_page_header, indent * " ") -end - -function show(io::IO, pages::Vector{PageHeader}, indent::AbstractString="") - println(io, indent, "Pages:") - for page in pages - show(io, page, indent * " ") - end -end - -show(io::IO, page::Page, indent::AbstractString="") = show(io, page.hdr, indent) -show(io::IO, pages::Vector{Page}, indent::AbstractString="") = show(io, [page.hdr for page in pages], indent) - -function show(io::IO, stat::Statistics, indent::AbstractString="") - println(io, indent, "Statistics:") - if Thrift.isfilled(stat, :min) && Thrift.isfilled(stat, :max) - println(io, indent, " range:", stat.min, ":", stat.max) - elseif Thrift.isfilled(stat, :min) - println(io, indent, " min:", stat.min) - elseif Thrift.isfilled(stat, :max) - println(io, indent, " max:", stat.max) - end - Thrift.isfilled(stat, :null_count) && println(io, indent, " null count:", stat.null_count) - Thrift.isfilled(stat, :distinct_count) && println(io, indent, " distinct count:", stat.distinct_count) -end - -function show(io::IO, page_enc::PageEncodingStats, indent::AbstractString="") - println(io, indent, page_enc.count, " ", Thrift.enumstr(Encoding, page_enc.encoding), " encoded ", Thrift.enumstr(PageType, page_enc.page_type), " pages") -end - -function show(io::IO, page_encs::Vector{PageEncodingStats}, indent::AbstractString="") - isempty(page_encs) && return - println(io, indent, "Page encoding statistics:") - for page_enc in page_encs - show(io, page_enc, indent * " ") - end -end - -function show(io::IO, colmeta::ColumnMetaData, indent::AbstractString="") - println(io, indent, Thrift.enumstr(_Type, coltype(colmeta)), " ", join(colname(colmeta), '.'), ", num values:", colmeta.num_values) - show_encodings(io, colmeta.encodings, indent) - if colmeta.codec != CompressionCodec.UNCOMPRESSED - println(io, indent, Thrift.enumstr(CompressionCodec, colmeta.codec), " compressed bytes:", colmeta.total_compressed_size, " (", colmeta.total_uncompressed_size, " uncompressed)") - else - println(io, indent, Thrift.enumstr(CompressionCodec, colmeta.codec), " bytes:", colmeta.total_compressed_size) - end - - print(io, indent, "offsets: data:", colmeta.data_page_offset) - Thrift.isfilled(colmeta, :index_page_offset) && print(io, ", index:", colmeta.index_page_offset) - Thrift.isfilled(colmeta, :dictionary_page_offset) && print(io, ", dictionary:", colmeta.dictionary_page_offset) - println(io, "") - Thrift.isfilled(colmeta, :statistics) && show(io, colmeta.statistics, indent) - Thrift.isfilled(colmeta, 
:encoding_stats) && show(io, colmeta.encoding_stats, indent) - Thrift.isfilled(colmeta, :key_value_metadata) && show(io, colmeta.key_value_metadata, indent) -end - -function show(io::IO, columns::Vector{ColumnChunk}, indent::AbstractString="") - for col in columns - path = isfilled(col, :file_path) ? col.file_path : "" - println(io, indent, "Column at offset: ", path, "#", col.file_offset) - show(io, col.meta_data, indent * " ") - end -end - -function show(io::IO, grp::RowGroup, indent::AbstractString="") - println(io, indent, "Row Group: ", grp.num_rows, " rows in ", grp.total_byte_size, " bytes") - show(io, grp.columns, indent * " ") -end - -function show(io::IO, row_groups::Vector{RowGroup}, indent::AbstractString="") - println(io, indent, "Row Groups:") - for grp in row_groups - show(io, grp, indent * " ") - end -end - -function show(io::IO, meta::FileMetaData, indent::AbstractString="") - println(io, indent, "version: ", meta.version) - println(io, indent, "nrows: ", meta.num_rows) - println(io, indent, "created by: ", meta.created_by) - - show(io, meta.schema, indent) - show(io, meta.row_groups, indent) - Thrift.isfilled(meta, :key_value_metadata) && show(io, meta.key_value_metadata, indent) -end - -function show(io::IO, par::ParFile) - println(io, "Parquet file: $(par.path)") - meta = par.meta - println(io, " version: $(meta.version)") - println(io, " nrows: $(meta.num_rows)") - println(io, " created by: $(meta.created_by)") - println(io, " cached: $(length(par.page_cache.refs)) column chunks") -end +# function print_indent(io, n) +# for d in 1:n +# print(io, " ") +# end +# end +# +# function show(io::IO, cursor::RecordCursor) +# par = cursor.par +# rows = cursor.colcursors[1].row.rows +# println(io, "Record Cursor on $(par.path)") +# println(io, " rows: $rows") +# +# colpaths = [join(colname, '.') for colname in cursor.colnames] +# println(io, " cols: $(join(colpaths, ", "))") +# end +# +# function show(io::IO, cursor::BatchedColumnsCursor) +# par = cursor.par +# rows = cursor.colcursors[1].row.rows +# println(io, "Batched Columns Cursor on $(par.path)") +# println(io, " rows: $rows") +# println(io, " batches: $(length(cursor))") +# +# colpaths = [join(colname, '.') for colname in cursor.colnames] +# println(io, " cols: $(join(colpaths, ", "))") +# end +# +# function show(io::IO, schema::SchemaElement, indent::AbstractString="", nchildren::Vector{Int}=Int[]) +# print(io, indent) +# lchildren = length(nchildren) +# print_indent(io, lchildren) +# if isfilled(schema, :repetition_type) +# r = schema.repetition_type +# print(io, (r == FieldRepetitionType.REQUIRED) ? "required" : (r == FieldRepetitionType.OPTIONAL) ? 
"optional" : "repeated", " "); +# end +# isfilled(schema, :_type) && print(io, Thrift.enumstr(_Type, schema._type), " ") +# +# print(io, schema.name) +# isfilled(schema, :field_id) && print(io, " (", schema.field_id, ")") +# +# if isfilled(schema, :converted_type) +# print(io, "# (from ", Thrift.enumstr(ConvertedType, schema.converted_type)) +# if schema.converted_type == ConvertedType.DECIMAL +# print(io, "(", schema.scale, ".", schema.precision) +# end +# print(") ") +# end +# +# if isfilled(schema, :num_children) +# push!(nchildren, schema.num_children) +# print(io, " {") +# elseif lchildren > 0 +# nchildren[lchildren] -= 1 +# if nchildren[lchildren] == 0 +# pop!(nchildren) +# println(io, "") +# print_indent(io, length(nchildren)) +# print(io, indent, "}") +# end +# end +# +# println(io, "") +# end +# +# function show(io::IO, schema::Vector{SchemaElement}, indent::AbstractString="") +# println(io, indent, "Schema:") +# nchildren=Int[] +# for schemaelem in schema +# show(io, schemaelem, indent * " ", nchildren) +# end +# end +# +# show(io::IO, schema::Schema, indent::AbstractString="") = show(io, schema.schema, indent) +# +# function show(io::IO, kvmeta::KeyValue, indent::AbstractString="") +# println(io, indent, kvmeta.key, " => ", kvmeta.value) +# end +# +# function show(io::IO, kvmetas::Vector{KeyValue}, indent::AbstractString="") +# isempty(kvmetas) && return +# println(io, indent, "Metadata:") +# for kvmeta in kvmetas +# show(io, kvmeta, indent * " ") +# end +# end +# +# function show_encodings(io::IO, encodings::Vector{Int32}, indent::AbstractString="") +# isempty(encodings) && return +# print(io, indent, "Encodings: ") +# pfx = "" +# for encoding in encodings +# print(io, pfx, Thrift.enumstr(Encoding, encoding)) +# pfx = ", " +# end +# println(io, "") +# end +# +# show(io::IO, hdr::IndexPageHeader, indent::AbstractString="") = nothing +# function show(io::IO, page::DictionaryPageHeader, indent::AbstractString="") +# println(io, indent, page.num_values, " values") +# end +# +# function show(io::IO, hdr::DataPageHeader, indent::AbstractString="") +# println(io, indent, hdr.num_values, " values") +# println(io, indent, "encodings: values as ", Thrift.enumstr(Encoding, hdr.encoding), ", definitions as ", Thrift.enumstr(Encoding, hdr.definition_level_encoding), ", repetitions as ", Thrift.enumstr(Encoding, hdr.repetition_level_encoding)) +# Thrift.isfilled(hdr, :statistics) && show(io, hdr.statistics, indent) +# end +# +# function show(io::IO, hdr::DataPageHeaderV2, indent::AbstractString="") +# compressed = Thrift.isfilled(hdr, :is_compressed) ? 
hdr.is_compressed : true +# println(io, indent, hdr.num_values, " values, ", hdr.num_nulls, " nulls, ", hdr.num_rows, " rows, compressed:", compressed) +# println(io, indent, "encoding:", Thrift.enumstr(Encoding, hdr.encoding), ", definition:", Thrift.enumstr(Encoding, hdr.definition_level_encoding), ", repetition:", Thrift.enumstr(Encoding, hdr.repetition_level_encoding)) +# Thrift.isfilled(hdr, :statistics) && show(io, hdr.statistics, indent) +# end +# +# function show(io::IO, page::PageHeader, indent::AbstractString="") +# println(io, indent, Thrift.enumstr(PageType, page._type), " compressed bytes:", page.compressed_page_size, " (", page.uncompressed_page_size, " uncompressed)") +# Thrift.isfilled(page, :data_page_header) && show(io, page.data_page_header, indent * " ") +# Thrift.isfilled(page, :data_page_header_v2) && show(io, page.data_page_header_v2, indent * " ") +# Thrift.isfilled(page, :index_page_header) && show(io, page.index_page_header, indent * " ") +# Thrift.isfilled(page, :dictionary_page_header) && show(io, page.dictionary_page_header, indent * " ") +# end +# +# function show(io::IO, pages::Vector{PageHeader}, indent::AbstractString="") +# println(io, indent, "Pages:") +# for page in pages +# show(io, page, indent * " ") +# end +# end +# +# show(io::IO, page::Page, indent::AbstractString="") = show(io, page.hdr, indent) +# show(io::IO, pages::Vector{Page}, indent::AbstractString="") = show(io, [page.hdr for page in pages], indent) +# +# function show(io::IO, stat::Statistics, indent::AbstractString="") +# println(io, indent, "Statistics:") +# if Thrift.isfilled(stat, :min) && Thrift.isfilled(stat, :max) +# println(io, indent, " range:", stat.min, ":", stat.max) +# elseif Thrift.isfilled(stat, :min) +# println(io, indent, " min:", stat.min) +# elseif Thrift.isfilled(stat, :max) +# println(io, indent, " max:", stat.max) +# end +# Thrift.isfilled(stat, :null_count) && println(io, indent, " null count:", stat.null_count) +# Thrift.isfilled(stat, :distinct_count) && println(io, indent, " distinct count:", stat.distinct_count) +# end +# +# function show(io::IO, page_enc::PageEncodingStats, indent::AbstractString="") +# println(io, indent, page_enc.count, " ", Thrift.enumstr(Encoding, page_enc.encoding), " encoded ", Thrift.enumstr(PageType, page_enc.page_type), " pages") +# end +# +# function show(io::IO, page_encs::Vector{PageEncodingStats}, indent::AbstractString="") +# isempty(page_encs) && return +# println(io, indent, "Page encoding statistics:") +# for page_enc in page_encs +# show(io, page_enc, indent * " ") +# end +# end +# +# function show(io::IO, colmeta::ColumnMetaData, indent::AbstractString="") +# println(io, indent, Thrift.enumstr(_Type, coltype(colmeta)), " ", join(colname(colmeta), '.'), ", num values:", colmeta.num_values) +# show_encodings(io, colmeta.encodings, indent) +# if colmeta.codec != CompressionCodec.UNCOMPRESSED +# println(io, indent, Thrift.enumstr(CompressionCodec, colmeta.codec), " compressed bytes:", colmeta.total_compressed_size, " (", colmeta.total_uncompressed_size, " uncompressed)") +# else +# println(io, indent, Thrift.enumstr(CompressionCodec, colmeta.codec), " bytes:", colmeta.total_compressed_size) +# end +# +# print(io, indent, "offsets: data:", colmeta.data_page_offset) +# Thrift.isfilled(colmeta, :index_page_offset) && print(io, ", index:", colmeta.index_page_offset) +# Thrift.isfilled(colmeta, :dictionary_page_offset) && print(io, ", dictionary:", colmeta.dictionary_page_offset) +# println(io, "") +# Thrift.isfilled(colmeta, 
:statistics) && show(io, colmeta.statistics, indent) +# Thrift.isfilled(colmeta, :encoding_stats) && show(io, colmeta.encoding_stats, indent) +# Thrift.isfilled(colmeta, :key_value_metadata) && show(io, colmeta.key_value_metadata, indent) +# end +# +# function show(io::IO, columns::Vector{ColumnChunk}, indent::AbstractString="") +# for col in columns +# path = isfilled(col, :file_path) ? col.file_path : "" +# println(io, indent, "Column at offset: ", path, "#", col.file_offset) +# show(io, col.meta_data, indent * " ") +# end +# end +# +# function show(io::IO, grp::RowGroup, indent::AbstractString="") +# println(io, indent, "Row Group: ", grp.num_rows, " rows in ", grp.total_byte_size, " bytes") +# show(io, grp.columns, indent * " ") +# end +# +# function show(io::IO, row_groups::Vector{RowGroup}, indent::AbstractString="") +# println(io, indent, "Row Groups:") +# for grp in row_groups +# show(io, grp, indent * " ") +# end +# end +# +# function show(io::IO, meta::FileMetaData, indent::AbstractString="") +# println(io, indent, "version: ", meta.version) +# println(io, indent, "nrows: ", meta.num_rows) +# println(io, indent, "created by: ", meta.created_by) +# +# show(io, meta.schema, indent) +# show(io, meta.row_groups, indent) +# Thrift.isfilled(meta, :key_value_metadata) && show(io, meta.key_value_metadata, indent) +# end +# +# function show(io::IO, par::ParFile) +# println(io, "Parquet file: $(par.path)") +# meta = par.meta +# println(io, " version: $(meta.version)") +# println(io, " nrows: $(meta.num_rows)") +# println(io, " created by: $(meta.created_by)") +# println(io, " cached: $(length(par.page_cache.refs)) column chunks") +# end From 9f50dad013d453730aaed62c0d009ca73ed877aa Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Wed, 27 May 2020 19:43:01 +1000 Subject: [PATCH 50/52] minor bug fix --- src/column_reader.jl | 1 - src/column_reader_dev.jl | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 1f67e05..f54a6bb 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -352,7 +352,6 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else if has_missing raw_data = reinterpret(T, read(uncompressed_data_io)) - return raw_data, missing_bytes j = 1 for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 34e9a58..bcbf2d9 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -45,6 +45,8 @@ path = "c:/data/Performance_2003Q3.txt.parquet" @time adf = read_parquet(path); +adf.V5 + col_num = 1 @time col1 = Parquet.read_column(path, col_num); From 0c81da98a7686ba13176cb0723ac1e35349232d3 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Fri, 29 May 2020 14:30:13 +1000 Subject: [PATCH 51/52] before operating on misssing bytes --- src/column_reader.jl | 68 +++++++++++++++++++++++++++++----------- src/column_reader_dev.jl | 43 +++++++++++++------------ src/read_parquet.jl | 33 ++++++++++--------- 3 files changed, 88 insertions(+), 56 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index f54a6bb..34da73a 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -70,8 +70,23 @@ function read_column(path, filemetadata, col_num) end close(par) + fileio = open(path) + # I thnk there is a bug with Julia's multithreaded reads + # which can be fixed by doing the below + # DO NOT remove the code below or multithreading will fail + println("$(position(fileio))") + if true + 
not_used = open(tempname()*string(col_num), "w") + write(not_used, position(fileio)) + close(not_used) + end + + # to reduce allocations we make a compressed_data array to store compressed data + compressed_data_buffer = Vector{UInt8}(undef, 100) + compressed_data = UInt8[] # initialise it + from = 1 last_from = from @@ -82,7 +97,14 @@ function read_column(path, filemetadata, col_num) if isfilled(colchunk_meta, :dictionary_page_offset) seek(fileio, colchunk_meta.dictionary_page_offset) dict_page_header = read_thrift(fileio, PAR2.PageHeader) - compressed_data = read(fileio, dict_page_header.compressed_page_size) + + # use the + readbytes!(fileio, compressed_data_buffer, dict_page_header.compressed_page_size) + GC.@preserve compressed_data_buffer begin + compressed_data = unsafe_wrap(Vector{UInt8}, pointer(compressed_data_buffer), dict_page_header.compressed_page_size) + end + # compressed_data = read(fileio, dict_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size @@ -100,6 +122,10 @@ function read_column(path, filemetadata, col_num) end else dict = reinterpret(T, uncompressed_data) + # nvals = dict_page_header.dictionary_page_header.num_values + # GC.@preserve uncompressed_data begin + # dict = unsafe_wrap(Vector{T}, Ptr{T}(pointer(uncompressed_data)), nvals) + # end end else error("Only Plain Dictionary encoding is supported") @@ -111,8 +137,8 @@ function read_column(path, filemetadata, col_num) # seek to the first data page seek(fileio, colchunk_meta.data_page_offset) - # repeated read data page + # repeated read data page while (from - last_from < row_group.num_rows) & (from <= length(res)) from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) @@ -124,7 +150,7 @@ function read_column(path, filemetadata, col_num) end last_from = from - # (j == 1) && return res + # (j == 2) && return res j += 1 end @@ -141,15 +167,22 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ res_len = length(res) data_page_header = read_thrift(fileio, PAR2.PageHeader) - compressed_data = read(fileio, data_page_header.compressed_page_size) - uncompressed_data = decompress_with_codec(compressed_data, codec) + + #compressed_data = read(fileio, data_page_header.compressed_page_size) + compressed_data_buffer = Vector{UInt8}(undef, ceil(Int, data_page_header.compressed_page_size*1.5)) + + readbytes!(fileio, compressed_data_buffer, data_page_header.compressed_page_size) + GC.@preserve compressed_data_buffer begin + compressed_data = unsafe_wrap(Vector{UInt8}, pointer(compressed_data_buffer), data_page_header.compressed_page_size) + uncompressed_data = decompress_with_codec(compressed_data, codec) + end + @assert length(uncompressed_data) == data_page_header.uncompressed_page_size # this is made up of these 3 things written back to back # * repetition levels - can be ignored for unnested data # * definition levels - # * values - uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) # this will be set in future @@ -158,10 +191,11 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the number of values stored in this page num_values = data_page_header.data_page_header.num_values + # initialise it to something + missing_bytes = Vector{UInt8}(undef, num_values) + missing_bytes_io = IOBuffer(missing_bytes, write=true) + # definition levels - # do_read_defn_lvls = 
isfilled(data_page_header.data_page_header, :statistics) && - # isfilled(data_page_header.data_page_header.statistics, :null_count) && - # data_page_header.data_page_header.statistics.null_count > 0 if data_page_header.data_page_header.definition_level_encoding == PAR2.Encoding.RLE # for unnested columns the highest possible value for definiton is 1 # which can represented with just one bit so the bit width is always 1 @@ -173,13 +207,9 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ pos_after_reading_encoded_data = pos_before_encoded_data - # initialise it to something - missing_bytes = UInt8[] - while (pos_after_reading_encoded_data - pos_before_encoded_data) < encoded_data_len encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) - # TODO it's possible to be mixing RLE and bitpacked in one algorithm if iseven(encoded_data_header) # RLE encoded rle_len = Int(encoded_data_header >> 1) @@ -198,11 +228,11 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end end - append!(missing_bytes, fill(rle_val, rle_len)) + write(missing_bytes_io, fill(rle_val, rle_len)) from_defn += rle_len - @assert from_defn - from == length(missing_bytes) - @assert length(missing_bytes) <= num_values + @assert from_defn - from == position(missing_bytes_io) + @assert position(missing_bytes_io) <= num_values else # the only reaosn to use bitpacking is because there are missings has_missing = true @@ -239,10 +269,10 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ len_to_write = new_from_defn - from_defn if len_to_write == len_of_tmp_missing_bytes - append!(missing_bytes, tmp_missing_bytes) + write(missing_bytes_io, tmp_missing_bytes) elseif len_to_write < len_of_tmp_missing_bytes tmp_missing_bytes_smaller = unsafe_wrap(Vector{UInt8}, pointer(tmp_missing_bytes), len_to_write) - append!(missing_bytes, tmp_missing_bytes_smaller) + write(missing_bytes_io, tmp_missing_bytes_smaller) else error("something is wrong") end @@ -281,7 +311,7 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ @assert pos_after_reading_encoded_data - pos_before_encoded_data == encoded_data_len if has_missing - @assert length(missing_bytes) == num_values + @assert position(missing_bytes_io) == num_values end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index bcbf2d9..9834207 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -1,7 +1,21 @@ using Parquet -using Parquet:TYPES, read_thrift, PAR2, BitPackedIterator, decompress_with_codec -using Thrift: isfilled -using Snappy, CodecZlib, CodecZstd + +path = "c:/data/Performance_2003Q3.txt.parquet" +@time Parquet.read_column(path, 1); + + +@time read_parquet(path); + +path = "c:/git/parquet-data-collection/dsd50p.parquet" +@time adf = read_parquet(path); + +@time adf = read_parquet(path, multithreaded=false); + + + +using JDF: type_compress! 
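The missing_bytes bookkeeping reworked above consumes Parquet's definition levels, which are stored in the RLE/bit-packed hybrid encoding: a varint header whose low bit selects between a run-length run (even) and groups of eight bit-packed values (odd). A hedged, standalone sketch of that decoder for the flat-column case (bit width 1, so 1 = value present, 0 = missing), with the varint reader passed in as an argument; the patches use the internal Parquet._read_varint for this step:

    # Sketch only, not the package's implementation.
    # `read_varint(io)` is assumed to return the next ULEB128-encoded UInt32.
    function decode_definition_levels(io::IO, num_values::Integer, read_varint)
        levels = UInt8[]
        while length(levels) < num_values
            header = read_varint(io)
            if iseven(header)
                # RLE run: the value repeats `run_len` times and is stored in
                # ceil(bitwidth/8) bytes -- a single byte at bit width 1.
                run_len = Int(header >> 1)
                val = read(io, UInt8)
                append!(levels, fill(val, run_len))
            else
                # Bit-packed: `ngroups` groups of 8 values, packed LSB first.
                ngroups = Int(header >> 1)
                for _ in 1:ngroups
                    byte = read(io, UInt8)
                    for bit in 0:7
                        push!(levels, (byte >> bit) & 0x01)
                    end
                end
            end
        end
        resize!(levels, num_values)   # a bit-packed tail always comes in multiples of 8
        return levels
    end

That multiple-of-8 padding in the bit-packed branch is exactly why the reader above tracks len_to_write and trims tmp_missing_bytes before appending it to the missing-bytes buffer.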
+ +@time adf = type_compress!(DataFrame(read_parquet(path, multithreaded=false), copycols=false)); using Random: randstring tbl = ( @@ -22,32 +36,17 @@ tbl = ( tmpfile = tempname()*".parquet" @time write_parquet(tmpfile, tbl); + path = tmpfile +@time adf = read_parquet(path); + +all([all(c1 .=== c2) for (c1, c2) in zip(tbl, adf)]) -col_num=12 -@time col1 = Parquet.read_column(path, col_num); -all(col1 .=== tbl.stringm) -a = read_parquet(path) using BenchmarkTools @benchmark adf = read_parquet(path) - -adf - - - - - -path = "c:/git/parquet-data-collection/dsd50p.parquet" -path = "c:/data/Performance_2003Q3.txt.parquet" - -@time adf = read_parquet(path); - -adf.V5 - - col_num = 1 @time col1 = Parquet.read_column(path, col_num); col1 diff --git a/src/read_parquet.jl b/src/read_parquet.jl index de953a9..68488ec 100644 --- a/src/read_parquet.jl +++ b/src/read_parquet.jl @@ -1,6 +1,6 @@ using Base.Threads: @spawn using Base.Iterators: drop -using ProgressMeter: @showprogress +using ProgressMeter: @showprogress, Progress, next! using NamedTupleTools: namedtuple read_parquet(path, cols::Vector{Symbol}; kwargs...) = read_parquet(path, String.(cols); kwargs...) @@ -25,25 +25,28 @@ function read_parquet(path, cols::Vector{String}; multithreaded=true, verbose = results = Vector{Any}(undef, length(colnums)) - filemetadata = metadata(path) + filemetadata = metadata(path) - if multithreaded + symbol_col_names = collect(Symbol(col) for col in colnames[colnums]) + + p = Progress(length(colnums)) + if multithreaded for (i, j) in enumerate(colnums) - results[i] = @spawn read_column(path, filemetadata, j) - end - else - @showprogress for (i, j) in enumerate(colnums) - results[i] = read_column(path, filemetadata, j) + results[i] = @spawn begin + # next!(p) + res = read_column(path, filemetadata, j) + res + end + end + results = fetch.(results) + else + + for (i, j) in enumerate(colnums) + results[i] = read_column(path, filemetadata, j) + next!(p) end end - symbol_col_names = collect(Symbol(col) for col in colnames[colnums]) - - if multithreaded - @showprogress for i in 1:length(results) - results[i] = fetch(results[i]) - end - end return namedtuple(symbol_col_names, results) end From f6d2309c4df8cd97e40bd7e42e8e87ef1ebca400 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Sat, 30 May 2020 11:43:51 +1000 Subject: [PATCH 52/52] before major operation on cutting down on memory usage for missing --- src/column_reader.jl | 82 ++++++++++++++++++++++++++++------------ src/column_reader_dev.jl | 8 +++- 2 files changed, 64 insertions(+), 26 deletions(-) diff --git a/src/column_reader.jl b/src/column_reader.jl index 34da73a..e0a2eec 100644 --- a/src/column_reader.jl +++ b/src/column_reader.jl @@ -1,4 +1,4 @@ -import Base: iterate, length, IteratorSize, IteratorEltype, eltype +import Base: iterate, length, IteratorSize, IteratorEltype, eltype, @_gc_preserve_begin, @_gc_preserve_end const TYPES = (Bool, Int32, Int64, Int128, Float32, Float64, String, UInt8) @@ -43,9 +43,9 @@ function iterate(bp::BitPackedIterator, state) (value & UInt(2^bp.bitwidth-1), state + 1) end -function decompress_with_codec(compressed_data::Vector{UInt8}, codec)::Vector{UInt8} +function decompress_with_codec!(uncompressed_data::Vector{UInt8}, compressed_data::Vector{UInt8}, codec) if codec == PAR2.CompressionCodec.SNAPPY - uncompressed_data = Snappy.uncompress(compressed_data) + Snappy.snappy_uncompress(compressed_data, uncompressed_data) else error("codedc $codec unsupported atm") end @@ -70,18 +70,17 @@ function read_column(path, filemetadata, 
col_num) end close(par) - fileio = open(path) # I thnk there is a bug with Julia's multithreaded reads # which can be fixed by doing the below # DO NOT remove the code below or multithreading will fail println("$(position(fileio))") - if true - not_used = open(tempname()*string(col_num), "w") - write(not_used, position(fileio)) - close(not_used) - end + # if true + # not_used = open(tempname()*string(col_num), "w") + # write(not_used, position(fileio)) + # close(not_used) + # end # to reduce allocations we make a compressed_data array to store compressed data compressed_data_buffer = Vector{UInt8}(undef, 100) @@ -105,7 +104,9 @@ function read_column(path, filemetadata, col_num) end # compressed_data = read(fileio, dict_page_header.compressed_page_size) - uncompressed_data = decompress_with_codec(compressed_data, colchunk_meta.codec) + uncompressed_data = Vector{UInt8}(undef, dict_page_header.uncompressed_page_size) + + decompress_with_codec!(uncompressed_data, compressed_data, colchunk_meta.codec) @assert length(uncompressed_data) == dict_page_header.uncompressed_page_size if dict_page_header.dictionary_page_header.encoding == PAR2.Encoding.PLAIN_DICTIONARY @@ -137,10 +138,12 @@ function read_column(path, filemetadata, col_num) # seek to the first data page seek(fileio, colchunk_meta.data_page_offset) + # the buffer is resizable and is used to reduce the amount of allocations + uncompressed_data_buffer = Vector{UInt8}(undef, 1048584) # repeated read data page while (from - last_from < row_group.num_rows) & (from <= length(res)) - from = read_data_page_vals!(res, fileio, dict, colchunk_meta.codec, T, from) + from = read_data_page_vals!(res, uncompressed_data_buffer, fileio, dict, colchunk_meta.codec, T, from) if from isa Tuple return from @@ -158,7 +161,7 @@ function read_column(path, filemetadata, col_num) res end -function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integer = 1) +function read_data_page_vals!(res, uncompressed_data_buffer::Vector{UInt8}, fileio::IOStream, dict, codec, T, from::Integer = 1) """ Read one data page """ @@ -168,29 +171,44 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ data_page_header = read_thrift(fileio, PAR2.PageHeader) + # the number of values stored in this page + num_values = data_page_header.data_page_header.num_values + # read values + to = from + num_values - 1 + @assert to <= res_len + #compressed_data = read(fileio, data_page_header.compressed_page_size) - compressed_data_buffer = Vector{UInt8}(undef, ceil(Int, data_page_header.compressed_page_size*1.5)) + compressed_data_buffer = Vector{UInt8}(undef, ceil(Int, data_page_header.compressed_page_size)) readbytes!(fileio, compressed_data_buffer, data_page_header.compressed_page_size) - GC.@preserve compressed_data_buffer begin + + # resize the buffer if it's too small + if data_page_header.uncompressed_page_size > length(uncompressed_data_buffer) + uncompressed_data_buffer = Vector{UInt8}(undef, ceil(Int, data_page_header.uncompressed_page_size*1.1)) + end + + t1 = @_gc_preserve_begin uncompressed_data_buffer + + GC.@preserve compressed_data_buffer uncompressed_data_buffer begin compressed_data = unsafe_wrap(Vector{UInt8}, pointer(compressed_data_buffer), data_page_header.compressed_page_size) - uncompressed_data = decompress_with_codec(compressed_data, codec) + uncompressed_data = unsafe_wrap(Vector{UInt8}, pointer(uncompressed_data_buffer), data_page_header.uncompressed_page_size) + # uncompressed_data = Vector{UInt8}(undef, 
data_page_header.uncompressed_page_size) + # decompression seems to be quite slow and uses lots of RAM! + decompress_with_codec!(uncompressed_data, compressed_data, codec) end @assert length(uncompressed_data) == data_page_header.uncompressed_page_size + uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) + # this is made up of these 3 things written back to back # * repetition levels - can be ignored for unnested data # * definition levels - # * values - uncompressed_data_io = IOBuffer(uncompressed_data, read=true, write=false, append=false) # this will be set in future has_missing = false - # the number of values stored in this page - num_values = data_page_header.data_page_header.num_values - # initialise it to something missing_bytes = Vector{UInt8}(undef, num_values) missing_bytes_io = IOBuffer(missing_bytes, write=true) @@ -222,6 +240,8 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else # fill the memory location with all missing GC.@preserve res begin + # TODO there is a better way to locate the missing bytes + # find the location of missing dest_ptr = Ptr{UInt8}(pointer(res, res_len+1)) + from_defn - 1 tmparray = unsafe_wrap(Vector{UInt8}, dest_ptr, rle_len) fill!(tmparray, rle_val) @@ -315,12 +335,8 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end - # read values - to = from + num_values - 1 - @assert to <= res_len if data_page_header.data_page_header.encoding == PAR2.Encoding.PLAIN - # println("meh") # just return the data as is if T == Bool if has_missing @@ -381,7 +397,13 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ else if has_missing - raw_data = reinterpret(T, read(uncompressed_data_io)) + # raw_data = reinterpret(T, read(uncompressed_data_io)) + arr_pos = position(uncompressed_data_io) + 1 + # seek till the end + seek(uncompressed_data_io, uncompressed_data_io.size + 1) + # TODO remove this allocation too + ok = uncompressed_data[arr_pos:end] + raw_data = reinterpret(T, ok) j = 1 for (i, missing_byte) in zip(from:to, missing_bytes) if missing_byte == 1 @@ -411,10 +433,15 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ # the documented max bitwidth is @assert bitwidth <= 32 + rle_cnt = 0 + bp_cnt = 0 + rle_size = 0 + bp_size = 0 while !eof(uncompressed_data_io) encoded_data_header = Parquet._read_varint(uncompressed_data_io, UInt32) if iseven(encoded_data_header) + rle_cnt += 1 # RLE encoded rle_len = Int(encoded_data_header >> 1) rle_val_vec::Vector{UInt8} = read(uncompressed_data_io, ceil(Int, bitwidth/8)) @@ -436,8 +463,10 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ res[from:min(to, from + rle_len - 1)] .= dict[rle_val+1] end + rle_size += rle_len from = from + rle_len else + bp_cnt += 1 # bitpacked encoded bit_pack_len = Int(encoded_data_header >> 1) @assert (bit_pack_len >= 1) && (bit_pack_len <= 2^31 - 1) @@ -465,13 +494,16 @@ function read_data_page_vals!(res, fileio::IOStream, dict, codec, T, from::Integ end end - + bp_size += l from = from + l end end + # println("rle_cnt $rle_cnt bp_cnt $bp_cnt rle_size $rle_size bp_size $bp_size") else erorr("encoding not supported") end + @_gc_preserve_end t2 + return to end diff --git a/src/column_reader_dev.jl b/src/column_reader_dev.jl index 9834207..eb5e623 100644 --- a/src/column_reader_dev.jl +++ b/src/column_reader_dev.jl @@ -1,7 +1,13 @@ using Parquet path = 
"c:/data/Performance_2003Q3.txt.parquet" -@time Parquet.read_column(path, 1); +#Parquet.metadata(path) +@time col = Parquet.read_column(path, 5); + +for i in 1:31 + println(i) + @time Parquet.read_column(path, i); +end @time read_parquet(path);