Skip to content

Commit

Permalink
Merge pull request #118 from JuliaIO/tan/fixoffsetstart
Browse files: browse the repository at this point in the history
fix loading parquet file with a start offset
  • Loading branch information
tanmaykm authored Dec 1, 2020
2 parents 438dcbe + 8190eb8 commit 5865a32
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/cursor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ function BatchedColumnsCursor(par::Parquet.File;
error("nested schemas are not supported with BatchedColumnsCursor yet")
end

colcursors = [ColCursor(par, colname) for colname in colnames(par)]
colcursors = [ColCursor(par, colname; rows=rows) for colname in colnames(par)]
rectype = ntcolstype(sch, sch.schema[1])
nbatches = ceil(Int, length(rows)/batchsize)
colbuffs = Union{Nothing,Vector}[nothing for idx in 1:length(colcursors)]
Expand Down
18 changes: 18 additions & 0 deletions test/test_load.jl
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,28 @@ function test_load_multiple_rowgroups()
end
end

# Check that a `BatchedColumnsCursor` opened on a row range that does not start
# at row 1 yields the same `c_custkey` values as the matching slice of a read
# that starts at row 1.
function test_load_at_offset()
    @testset "load file at offset" begin
        testfolder = joinpath(@__DIR__, "parquet-compatibility")
        testfile = joinpath(
            testfolder, "parquet-testdata", "impala", "1.1.1-NONE", "customer.impala.parquet"
        )
        parquet_file = Parquet.File(testfile)

        # `c_custkey` column of the first element yielded by a cursor over `rows`.
        function custkeys(rows)
            cursor = Parquet.BatchedColumnsCursor(parquet_file; rows=rows)
            return first(collect(cursor)).c_custkey
        end

        vals_20000_40000 = custkeys(20000:40000)
        vals_1_40000 = custkeys(1:40000)
        vals_2_40001 = custkeys(2:40001)

        @test vals_20000_40000 == vals_1_40000[20000:40000]
        # Compare against a slice of the same length (20001 elements). Vectors of
        # different lengths are never `==`, so a shorter slice would make this
        # check pass no matter what was read.
        @test vals_20000_40000 != vals_1_40000[1:20001]
        @test vals_2_40001[1:10000] == vals_1_40000[2:10001]
        @test vals_2_40001[1:10000] != vals_1_40000[1:10000]
    end
end

# Run each loader test, in order, inside a single "load files" test set.
@testset "load files" begin
    for run_test in (
        test_load_all_pages,
        test_decode_all_pages,
        test_load_boolean_and_ts,
        test_load_nested,
        test_load_multiple_rowgroups,
        test_load_at_offset,
    )
        run_test()
    end
end

2 comments on commit 5865a32

@tanmaykm
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/25665

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.7.1 -m "<description of version>" 5865a32d557f878d25313e357b69b1bf4a61fad6
git push origin v0.7.1

Please sign in to comment.