From 005c94612becd9e707efa758cd4ddc2212e1c7a4 Mon Sep 17 00:00:00 2001
From: Jacob Quinn
Date: Thu, 15 Apr 2021 22:49:44 -0600
Subject: [PATCH] Implement C Data integration

This starts work towards supporting the C data interface for the arrow format, as documented [here](https://arrow.apache.org/docs/format/CDataInterface.html#). Currently in this PR, it includes struct definitions and basic methods to allow getting a pointer to an `ArrowSchema`/`ArrowArray` C-compatible struct that can then be populated by another implementation.

For example, with this PR, you can do:
```julia
using Arrow, PyCall
pd = pyimport("pandas")
pa = pyimport("pyarrow")
df = pd.DataFrame(py"""{'a': [1, 2, 3, 4, 5], 'b': ['a', 'b', 'c', 'd', 'e']}"""o)
rb = pa.record_batch(df)
sch = Arrow.CData.getschema() do ptr
    rb.schema._export_to_c(Int(ptr))
end
arr = Arrow.CData.getarray() do ptr
    rb._export_to_c(Int(ptr))
end
```

Currently, these `ArrowSchema`/`ArrowArray` structs are pretty bare bones, but it at least lays some ground work for integration.

Things we still need/want to make all this nicer to use/work with:

* Type format string parsing/converting: we need to parse the type format strings as outlined [here](https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings) to figure out what type of data we'll get in the arrays. It'd probably be best to add a `type` field to the ArrowSchema struct that we'd populate when converting from `CArrowSchema` -> `ArrowSchema`
* Add a method like `Arrow.ArrowVector(::ArrowSchema, ::ArrowArray)` that produces a concrete `ArrowVector` subtype, like `Arrow.Primitive`, `Arrow.List`, etc. This will be a bit tricky, because we have to follow all the same columnar layout trickery that we currently handle for IPC in the table.jl `build` methods. Perhaps we can refactor all that so we can re-use some code? Otherwise, we might just need to reimplement a bunch of that logic specific to converting `ArrowArray`s.
* That should give a robust consuming story; for producing, we probably need a definition like `Arrow.ArrowSchema(a::Arrow.ArrowVector)` that produced a valid `ArrowSchema`, and then overloads per `ArrowVector` subtype like `Arrow.ArrowArray(x::Arrow.Primitive)` that produced the right `ArrowArray` for a concrete arrow array
* Then the last piece we need is just figuring out the right mechanics for providing a pointer to the `CArrowSchema`, `CArrowArray` structs once they're populated

If anyone would like to help out, I'm happy to provide as much guidance as possible so others can get their feet wet in some arrow spec nitty-gritty.
---
 src/Arrow.jl      |   1 +
 src/cinterface.jl | 269 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 270 insertions(+)
 create mode 100644 src/cinterface.jl

diff --git a/src/Arrow.jl b/src/Arrow.jl
index 323a3b8e..4eac1762 100644
--- a/src/Arrow.jl
+++ b/src/Arrow.jl
@@ -92,6 +92,7 @@ include("arraytypes/arraytypes.jl")
 include("eltypes.jl")
 include("table.jl")
 include("write.jl")
+include("cinterface.jl")
 
 const LZ4_FRAME_COMPRESSOR = LZ4FrameCompressor[]
 const ZSTD_COMPRESSOR = ZstdCompressor[]
diff --git a/src/cinterface.jl b/src/cinterface.jl
new file mode 100644
index 00000000..a665e7aa
--- /dev/null
+++ b/src/cinterface.jl
@@ -0,0 +1,269 @@
+"""
+    CData
+
+Building blocks for the Arrow C data interface: C-layout mirrors of the
+`ArrowSchema`/`ArrowArray` structs and Julia wrappers around them that another
+Arrow implementation populates through a raw pointer.
+"""
+module CData
+
+export ArrowSchema, ArrowArray, getschema, getarray
+
+# Bit flags that may be set in the `flags` field of a `CArrowSchema`.
+const ARROW_FLAG_DICTIONARY_ORDERED = 1
+const ARROW_FLAG_NULLABLE = 2
+const ARROW_FLAG_MAP_KEYS_SORTED = 4
+
+# Decode a NUL-terminated C string; a NULL pointer is returned as "".
+_cstring(p::Ptr{UInt8}) = p == C_NULL ? "" : unsafe_string(p)
+
+# Copy out the `n` structs referenced by a C array of struct pointers. Each pointer
+# is dereferenced on its own because the pointees need not be contiguous in memory.
+function _loadchildren(p::Ptr{Ptr{T}}, n::Integer) where {T}
+    p == C_NULL && return T[]
+    return T[unsafe_load(unsafe_load(p, i)) for i in 1:n]
+end
+
+# Call the `release` callback stored in the struct held by `ref`, unless it is NULL.
+function _release!(ref::Ref{T}) where {T}
+    r = getfield(ref[], :release)
+    r == C_NULL || ccall(r, Cvoid, (Ptr{T},), ref)
+    return nothing
+end
+
+# Read one `Int32`-length-prefixed string at `ptr`; return it and the pointer past it.
+function _readprefixed(ptr::Ptr{UInt8})
+    len = unsafe_load(convert(Ptr{Int32}, ptr))
+    ptr += 4
+    return unsafe_string(ptr, len), ptr + len
+end
+
+"""
+    CArrowSchema
+
+Field-for-field mirror of the C `struct ArrowSchema`. Property access decodes the
+pointer-valued fields (`format`, `name`, `metadata`, `children`, `dictionary`) into
+Julia values; use `getfield` to read a raw pointer such as `release`.
+"""
+struct CArrowSchema
+    format::Ptr{UInt8}
+    name::Ptr{UInt8}
+    metadata::Ptr{UInt8}
+    flags::Int64
+    n_children::Int64
+    children::Ptr{Ptr{CArrowSchema}}
+    dictionary::Ptr{CArrowSchema}
+    release::Ptr{Cvoid}
+    private_data::Ptr{Cvoid}
+end
+
+# Zeroed struct: every pointer is NULL and both integer fields are 0.
+CArrowSchema() = CArrowSchema(C_NULL, C_NULL, C_NULL, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL)
+
+Base.propertynames(::CArrowSchema) =
+    (:format, :name, :metadata, :flags, :n_children, :children, :dictionary)
+
+"""
+    readmetadata(ptr::Ptr{UInt8}) -> Dict{String, String}
+
+Decode the key/value metadata block at `ptr`: an `Int32` entry count followed, for
+each entry, by a length-prefixed key and a length-prefixed value (`Int32` lengths).
+A NULL `ptr` yields an empty dictionary.
+"""
+function readmetadata(ptr::Ptr{UInt8})
+    meta = Dict{String, String}()
+    ptr == C_NULL && return meta
+    n_entries = unsafe_load(convert(Ptr{Int32}, ptr))
+    ptr += 4
+    for _ in 1:n_entries
+        key, ptr = _readprefixed(ptr)
+        val, ptr = _readprefixed(ptr)
+        meta[key] = val
+    end
+    return meta
+end
+
+# Serve the names listed in `propertynames`, decoding pointer-valued fields on access.
+function Base.getproperty(x::CArrowSchema, nm::Symbol)
+    if nm === :format
+        return _cstring(getfield(x, :format))
+    elseif nm === :name
+        return _cstring(getfield(x, :name))
+    elseif nm === :metadata
+        return readmetadata(getfield(x, :metadata))
+    elseif nm === :flags
+        return getfield(x, :flags)
+    elseif nm === :n_children
+        return getfield(x, :n_children)
+    elseif nm === :children
+        return _loadchildren(getfield(x, :children), getfield(x, :n_children))
+    elseif nm === :dictionary
+        d = getfield(x, :dictionary)
+        return d == C_NULL ? nothing : unsafe_load(d)
+    end
+    error("unknown property requested: $nm")
+end
+
+"""
+    ArrowSchema
+
+Julia-side copy of a `CArrowSchema` with strings, metadata, children and dictionary
+decoded eagerly. `carrowschema` holds the C struct the values were decoded from.
+"""
+mutable struct ArrowSchema
+    format::String
+    name::String
+    metadata::Dict{String, String}
+    flags::Int64
+    n_children::Int64
+    children::Vector{ArrowSchema}
+    dictionary::Union{Nothing, ArrowSchema}
+    carrowschema::Ref{CArrowSchema}
+end
+
+# Decode every field of `c` once and attach `ref` as the backing C struct.
+function _arrowschema(c::CArrowSchema, ref::Ref{CArrowSchema})
+    dict = c.dictionary
+    return ArrowSchema(
+        c.format, c.name, c.metadata, c.flags, c.n_children,
+        ArrowSchema[ArrowSchema(child) for child in c.children],
+        dict === nothing ? nothing : ArrowSchema(dict),
+        ref,
+    )
+end
+
+ArrowSchema(s::Ref{CArrowSchema}) = _arrowschema(s[], s)
+# Nested schemas keep a private copy of their struct. No finalizer is attached here;
+# in this file only `getschema` registers one, on the top-level schema.
+ArrowSchema(s::CArrowSchema) = _arrowschema(s, Ref(s))
+
+"""
+    getschema(f) -> ArrowSchema
+
+Allocate a zeroed `CArrowSchema`, call `f(ptr)` with a pointer to it so a producer
+can fill it in, and return the decoded `ArrowSchema`. A non-NULL `release` callback
+is invoked when the returned object is finalized, or right away if decoding throws.
+"""
+function getschema(f)
+    schref = Ref(CArrowSchema())
+    # Keep `schref` rooted while foreign code writes through the raw pointer.
+    GC.@preserve schref f(Base.unsafe_convert(Ptr{CArrowSchema}, schref))
+    sch = try
+        ArrowSchema(schref)
+    catch
+        _release!(schref)
+        rethrow()
+    end
+    finalizer(x -> _release!(x.carrowschema), sch)
+    return sch
+end
+
+"""
+    CArrowArray
+
+Field-for-field mirror of the C `struct ArrowArray`. Property access decodes
+`buffers`, `children` and `dictionary`; use `getfield` to read a raw pointer.
+"""
+struct CArrowArray
+    length::Int64
+    null_count::Int64
+    offset::Int64
+    n_buffers::Int64
+    n_children::Int64
+    buffers::Ptr{Ptr{UInt8}}
+    children::Ptr{Ptr{CArrowArray}}
+    dictionary::Ptr{CArrowArray}
+    release::Ptr{Cvoid}
+    private_data::Ptr{Cvoid}
+end
+
+# Zeroed struct: every count is 0 and every pointer is NULL.
+CArrowArray() = CArrowArray(0, 0, 0, 0, 0, C_NULL, C_NULL, C_NULL, C_NULL, C_NULL)
+
+Base.propertynames(::CArrowArray) = (
+    :length, :null_count, :offset, :n_buffers, :n_children, :buffers, :children,
+    :dictionary,
+)
+
+# Serve the names listed in `propertynames`, decoding pointer-valued fields on access.
+function Base.getproperty(x::CArrowArray, nm::Symbol)
+    if nm === :length
+        return getfield(x, :length)
+    elseif nm === :null_count
+        return getfield(x, :null_count)
+    elseif nm === :offset
+        return getfield(x, :offset)
+    elseif nm === :n_buffers
+        return getfield(x, :n_buffers)
+    elseif nm === :n_children
+        return getfield(x, :n_children)
+    elseif nm === :buffers
+        b = getfield(x, :buffers)
+        # Wraps the pointer table in place; Julia does not take ownership of it.
+        return b == C_NULL ? Ptr{UInt8}[] : unsafe_wrap(Array, b, getfield(x, :n_buffers))
+    elseif nm === :children
+        return _loadchildren(getfield(x, :children), getfield(x, :n_children))
+    elseif nm === :dictionary
+        d = getfield(x, :dictionary)
+        return d == C_NULL ? nothing : unsafe_load(d)
+    end
+    error("unknown property requested: $nm")
+end
+
+"""
+    ArrowArray
+
+Julia-side copy of a `CArrowArray` with children and dictionary decoded eagerly.
+`buffers` holds the raw buffer pointers; `carrowarray` holds the C struct the values
+were decoded from.
+"""
+mutable struct ArrowArray
+    length::Int64
+    null_count::Int64
+    offset::Int64
+    n_buffers::Int64
+    n_children::Int64
+    buffers::Vector{Ptr{UInt8}}
+    children::Vector{ArrowArray}
+    dictionary::Union{Nothing, ArrowArray}
+    carrowarray::Ref{CArrowArray}
+end
+
+# Decode every field of `c` once and attach `ref` as the backing C struct.
+function _arrowarray(c::CArrowArray, ref::Ref{CArrowArray})
+    dict = c.dictionary
+    return ArrowArray(
+        c.length, c.null_count, c.offset, c.n_buffers, c.n_children, c.buffers,
+        ArrowArray[ArrowArray(child) for child in c.children],
+        dict === nothing ? nothing : ArrowArray(dict),
+        ref,
+    )
+end
+
+ArrowArray(a::Ref{CArrowArray}) = _arrowarray(a[], a)
+# Nested arrays keep a private copy of their struct. No finalizer is attached here;
+# in this file only `getarray` registers one, on the top-level array.
+ArrowArray(a::CArrowArray) = _arrowarray(a, Ref(a))
+
+"""
+    getarray(f) -> ArrowArray
+
+Allocate a zeroed `CArrowArray`, call `f(ptr)` with a pointer to it so a producer
+can fill it in, and return the decoded `ArrowArray`. A non-NULL `release` callback
+is invoked when the returned object is finalized, or right away if decoding throws.
+"""
+function getarray(f)
+    arrref = Ref(CArrowArray())
+    # Keep `arrref` rooted while foreign code writes through the raw pointer.
+    GC.@preserve arrref f(Base.unsafe_convert(Ptr{CArrowArray}, arrref))
+    arr = try
+        ArrowArray(arrref)
+    catch
+        _release!(arrref)
+        rethrow()
+    end
+    finalizer(x -> _release!(x.carrowarray), arr)
+    return arr
+end
+
+end # module