From 8d5ad9b10e6c520149b24b37d187bdcd59ad81e0 Mon Sep 17 00:00:00 2001 From: Kevin Squire Date: Fri, 17 Nov 2017 21:22:53 -0500 Subject: [PATCH] Allow user to specify integer parsing type (fixes #223) (#224) * Allow user to specify integer parsing type * Added a ParserContext object, containing the dicttype and inttype to use for parsing JSON * Use keytype instead of accessing DictType.parameters[1] * four space indent * Rearrange function argument order to put ParserContext first * These are unexported, internal functions, and this ordering makes more sense. * Test inttype=BigInt as well * Update README with `inttype` information. --- README.md | 14 ++++++++-- src/Parser.jl | 63 ++++++++++++++++++++++++++---------------- test/parser/inttype.jl | 16 +++++++++++ test/runtests.jl | 4 +++ 4 files changed, 70 insertions(+), 27 deletions(-) create mode 100644 test/parser/inttype.jl diff --git a/README.md b/README.md index d9f892c..4365b8b 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,9 @@ json(a::Any) Returns a compact JSON representation as an `AbstractString`. ```julia -JSON.parse(s::AbstractString; dicttype=Dict) -JSON.parse(io::IO; dicttype=Dict) -JSON.parsefile(filename::AbstractString; dicttype=Dict, use_mmap=true) +JSON.parse(s::AbstractString; dicttype=Dict, inttype=Int64) +JSON.parse(io::IO; dicttype=Dict, inttype=Int64) +JSON.parsefile(filename::AbstractString; dicttype=Dict, inttype=Int64, use_mmap=true) ``` Parses a JSON `AbstractString` or IO stream into a nested `Array` or `Dict`. @@ -70,6 +70,14 @@ package](https://github.com/JuliaLang/DataStructures.jl) is installed), you can pass `dicttype=DataStructures.OrderedDict` to maintain the insertion order of the items in the object. +The `inttype` argument controls how integers are parsed. If a number in a JSON +file is recognized to be an integer, it is parsed as one; otherwise it is parsed +as a `Float64`. The `inttype` defaults to `Int64`, but, for example, if you know +that your integer numbers are all small and want to save space, you can pass +`inttype=Int32`. Alternatively, if your JSON input has integers which are too large +for Int64, you can pass `inttype=Int128` or `inttype=BigInt`. `inttype` can be any +subtype of `Real`. + ```julia JSON.lower(p::Point2D) = [p.x, p.y] ``` diff --git a/src/Parser.jl b/src/Parser.jl index b4477fd..fc7b3ec 100644 --- a/src/Parser.jl +++ b/src/Parser.jl @@ -30,6 +30,8 @@ mutable struct StreamingParserState{T <: IO} <: ParserState end StreamingParserState(io::IO) = StreamingParserState(io, 0x00, true) +struct ParserContext{DictType, IntType} end + """ Return the byte at the current position of the `ParserState`. If there is no byte (that is, the `ParserState` is done), then an error is thrown that the @@ -146,18 +148,18 @@ end Given a `ParserState`, after possibly any amount of whitespace, return the next parseable value. """ -function parse_value(ps::ParserState, dictT::Type) +function parse_value(pc::ParserContext, ps::ParserState) chomp_space!(ps) @inbounds byte = byteat(ps) if byte == STRING_DELIM parse_string(ps) elseif isjsondigit(byte) || byte == MINUS_SIGN - parse_number(ps) + parse_number(pc, ps) elseif byte == OBJECT_BEGIN - parse_object(ps, dictT) + parse_object(pc, ps) elseif byte == ARRAY_BEGIN - parse_array(ps, dictT) + parse_array(pc, ps) else parse_jsconstant(ps::ParserState) end @@ -179,13 +181,13 @@ function parse_jsconstant(ps::ParserState) end end -function parse_array(ps::ParserState, dictT::Type) +function parse_array(pc::ParserContext, ps::ParserState) result = Any[] @inbounds incr!(ps) # Skip over opening '[' chomp_space!(ps) if byteat(ps) ≠ ARRAY_END # special case for empty array @inbounds while true - push!(result, parse_value(ps, dictT)) + push!(result, parse_value(pc, ps)) chomp_space!(ps) byteat(ps) == ARRAY_END && break skip!(ps, DELIMITER) @@ -197,9 +199,9 @@ function parse_array(ps::ParserState, dictT::Type) end -function parse_object(ps::ParserState, dictT::Type) - obj = dictT() - keyT = dictT.parameters[1] +function parse_object(pc::ParserContext{DictType, <:Real}, ps::ParserState) where DictType + obj = DictType() + keyT = keytype(DictType) incr!(ps) # Skip over opening '{' chomp_space!(ps) @@ -212,7 +214,7 @@ function parse_object(ps::ParserState, dictT::Type) chomp_space!(ps) skip!(ps, SEPARATOR) # Read value - value = parse_value(ps, dictT) + value = parse_value(pc, ps) chomp_space!(ps) obj[convert(keyT, key)] = value byteat(ps) == OBJECT_END && break @@ -313,17 +315,25 @@ end Parse an integer from the given bytes vector, starting at `from` and ending at the byte before `to`. Bytes enclosed should all be ASCII characters. """ -function int_from_bytes(bytes::Vector{UInt8}, from::Int, to::Int) +function int_from_bytes(pc::ParserContext{<:Associative,IntType}, + ps::ParserState, + bytes::Vector{UInt8}, + from::Int, + to::Int) where IntType <: Real @inbounds isnegative = bytes[from] == MINUS_SIGN ? (from += 1; true) : false - num = Int64(0) + num = IntType(0) @inbounds for i in from:to - num = Int64(10) * num + Int64(bytes[i] - DIGIT_ZERO) + num = IntType(10) * num + IntType(bytes[i] - DIGIT_ZERO) end ifelse(isnegative, -num, num) end -function number_from_bytes( - ps::ParserState, isint::Bool, bytes::Vector{UInt8}, from::Int, to::Int) +function number_from_bytes(pc::ParserContext, + ps::ParserState, + isint::Bool, + bytes::Vector{UInt8}, + from::Int, + to::Int) @inbounds if hasleadingzero(bytes, from, to) _error(E_LEADING_ZERO, ps) end @@ -332,7 +342,7 @@ function number_from_bytes( @inbounds if to == from && bytes[from] == MINUS_SIGN _error(E_BAD_NUMBER, ps) end - int_from_bytes(bytes, from, to) + int_from_bytes(pc, ps, bytes, from, to) else res = float_from_bytes(bytes, from, to) isnull(res) ? _error(E_BAD_NUMBER, ps) : get(res) @@ -340,7 +350,7 @@ function number_from_bytes( end -function parse_number(ps::ParserState) +function parse_number(pc::ParserContext, ps::ParserState) # Determine the end of the floating point by skipping past ASCII values # 0-9, +, -, e, E, and . number = UInt8[] @@ -361,7 +371,7 @@ function parse_number(ps::ParserState) incr!(ps) end - number_from_bytes(ps, isint, number, 1, length(number)) + number_from_bytes(pc, ps, isint, number, 1, length(number)) end @@ -370,9 +380,10 @@ function unparameterize_type(T::Type) candidate <: Union{} ? T : candidate end -function parse(str::AbstractString; dicttype::Type{<:Associative}=Dict{String,Any}) +function parse(str::AbstractString; dicttype::Type{<:Associative}=Dict{String,Any}, inttype::Type{<:Real}=Int64) + pc = ParserContext{unparameterize_type(dicttype), inttype}() ps = MemoryParserState(Vector{UInt8}(String(str)), 1) - v = parse_value(ps, unparameterize_type(dicttype)) + v = parse_value(pc, ps) chomp_space!(ps) if hasmore(ps) _error(E_EXPECTED_EOF, ps) @@ -380,16 +391,20 @@ function parse(str::AbstractString; dicttype::Type{<:Associative}=Dict{String,An v end -function parse(io::IO; dicttype::Type{<:Associative}=Dict{String,Any}) +function parse(io::IO; dicttype::Type{<:Associative}=Dict{String,Any}, inttype::Type{<:Real}=Int64) + pc = ParserContext{unparameterize_type(dicttype), inttype}() ps = StreamingParserState(io) - parse_value(ps, unparameterize_type(dicttype)) + parse_value(pc, ps) end -function parsefile(filename::AbstractString; dicttype::Type{<:Associative}=Dict{String, Any}, use_mmap=true) +function parsefile(filename::AbstractString; + dicttype::Type{<:Associative}=Dict{String, Any}, + inttype::Type{<:Real}=Int64, + use_mmap=true) sz = filesize(filename) open(filename) do io s = use_mmap ? String(Mmap.mmap(io, Vector{UInt8}, sz)) : read(io, String) - parse(s; dicttype=dicttype) + parse(s; dicttype=dicttype, inttype=inttype) end end diff --git a/test/parser/inttype.jl b/test/parser/inttype.jl new file mode 100644 index 0000000..30e9ca1 --- /dev/null +++ b/test/parser/inttype.jl @@ -0,0 +1,16 @@ +@testset for T in [Int32, Int64, Int128, BigInt] + val = JSON.parse("{\"x\": 3}", inttype=T) + @test isa(val, Dict{String, Any}) + @test length(val) == 1 + key = collect(keys(val))[1] + @test string(key) == "x" + value = val[key] + @test value == 3 + @test typeof(value) == T +end + +@testset begin + teststr = """{"201736327611975630": 18005722827070440994}""" + val = JSON.parse(teststr, inttype=Int128) + @test val == Dict{String,Any}("201736327611975630"=> 18005722827070440994) +end diff --git a/test/runtests.jl b/test/runtests.jl index 15d6b75..e27310c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -20,6 +20,10 @@ include("json-samples.jl") include("parser/dicttype.jl") end + @testset "inttype" begin + include("parser/inttype.jl") + end + @testset "Miscellaneous" begin # test for single values @test JSON.parse("true") == true