From 8d5ad9b10e6c520149b24b37d187bdcd59ad81e0 Mon Sep 17 00:00:00 2001
From: Kevin Squire <kevin.squire@gmail.com>
Date: Fri, 17 Nov 2017 21:22:53 -0500
Subject: [PATCH] Allow user to specify integer parsing type (fixes #223)
 (#224)

* Allow user to specify integer parsing type

* Add a ParserContext object, containing the dicttype and
  inttype to use for parsing JSON

* Use keytype instead of accessing DictType.parameters[1]

* Use four-space indentation

* Rearrange function argument order to put ParserContext first

* These are unexported, internal functions, and this ordering makes more sense.

* Test inttype=BigInt as well

* Update README with `inttype` information.
---
 README.md              | 14 ++++++++--
 src/Parser.jl          | 63 ++++++++++++++++++++++++++----------------
 test/parser/inttype.jl | 16 +++++++++++
 test/runtests.jl       |  4 +++
 4 files changed, 70 insertions(+), 27 deletions(-)
 create mode 100644 test/parser/inttype.jl

diff --git a/README.md b/README.md
index d9f892c..4365b8b 100644
--- a/README.md
+++ b/README.md
@@ -54,9 +54,9 @@ json(a::Any)
 Returns a compact JSON representation as an `AbstractString`.
 
 ```julia
-JSON.parse(s::AbstractString; dicttype=Dict)
-JSON.parse(io::IO; dicttype=Dict)
-JSON.parsefile(filename::AbstractString; dicttype=Dict, use_mmap=true)
+JSON.parse(s::AbstractString; dicttype=Dict, inttype=Int64)
+JSON.parse(io::IO; dicttype=Dict, inttype=Int64)
+JSON.parsefile(filename::AbstractString; dicttype=Dict, inttype=Int64, use_mmap=true)
 ```
 
 Parses a JSON `AbstractString` or IO stream into a nested `Array` or `Dict`.
@@ -70,6 +70,14 @@ package](https://github.com/JuliaLang/DataStructures.jl) is
 installed), you can pass `dicttype=DataStructures.OrderedDict` to
 maintain the insertion order of the items in the object.
 
+The `inttype` argument controls how integers are parsed.  If a number in a JSON
+file is recognized to be an integer, it is parsed as one; otherwise it is parsed
+as a `Float64`.  `inttype` defaults to `Int64`, but, for example, if you know
+that your integer numbers are all small and want to save space, you can pass
+`inttype=Int32`.  Alternatively, if your JSON input has integers that are too large
+for `Int64`, you can pass `inttype=Int128` or `inttype=BigInt`.  `inttype` can be any
+subtype of `Real`.
+
 ```julia
 JSON.lower(p::Point2D) = [p.x, p.y]
 ```
diff --git a/src/Parser.jl b/src/Parser.jl
index b4477fd..fc7b3ec 100644
--- a/src/Parser.jl
+++ b/src/Parser.jl
@@ -30,6 +30,8 @@ mutable struct StreamingParserState{T <: IO} <: ParserState
 end
 StreamingParserState(io::IO) = StreamingParserState(io, 0x00, true)
 
+struct ParserContext{DictType, IntType} end
+
 """
 Return the byte at the current position of the `ParserState`. If there is no
 byte (that is, the `ParserState` is done), then an error is thrown that the
@@ -146,18 +148,18 @@ end
 Given a `ParserState`, after possibly any amount of whitespace, return the next
 parseable value.
 """
-function parse_value(ps::ParserState, dictT::Type)
+function parse_value(pc::ParserContext, ps::ParserState)
     chomp_space!(ps)
 
     @inbounds byte = byteat(ps)
     if byte == STRING_DELIM
         parse_string(ps)
     elseif isjsondigit(byte) || byte == MINUS_SIGN
-        parse_number(ps)
+        parse_number(pc, ps)
     elseif byte == OBJECT_BEGIN
-        parse_object(ps, dictT)
+        parse_object(pc, ps)
     elseif byte == ARRAY_BEGIN
-        parse_array(ps, dictT)
+        parse_array(pc, ps)
     else
         parse_jsconstant(ps::ParserState)
     end
@@ -179,13 +181,13 @@ function parse_jsconstant(ps::ParserState)
     end
 end
 
-function parse_array(ps::ParserState, dictT::Type)
+function parse_array(pc::ParserContext, ps::ParserState)
     result = Any[]
     @inbounds incr!(ps)  # Skip over opening '['
     chomp_space!(ps)
     if byteat(ps) ≠ ARRAY_END  # special case for empty array
         @inbounds while true
-            push!(result, parse_value(ps, dictT))
+            push!(result, parse_value(pc, ps))
             chomp_space!(ps)
             byteat(ps) == ARRAY_END && break
             skip!(ps, DELIMITER)
@@ -197,9 +199,9 @@ function parse_array(ps::ParserState, dictT::Type)
 end
 
 
-function parse_object(ps::ParserState, dictT::Type)
-    obj = dictT()
-    keyT = dictT.parameters[1]
+function parse_object(pc::ParserContext{DictType, <:Real}, ps::ParserState) where DictType
+    obj = DictType()
+    keyT = keytype(DictType)
 
     incr!(ps)  # Skip over opening '{'
     chomp_space!(ps)
@@ -212,7 +214,7 @@ function parse_object(ps::ParserState, dictT::Type)
             chomp_space!(ps)
             skip!(ps, SEPARATOR)
             # Read value
-            value = parse_value(ps, dictT)
+            value = parse_value(pc, ps)
             chomp_space!(ps)
             obj[convert(keyT, key)] = value
             byteat(ps) == OBJECT_END && break
@@ -313,17 +315,25 @@ end
 Parse an integer from the given bytes vector, starting at `from` and ending at
 the byte before `to`. Bytes enclosed should all be ASCII characters.
 """
-function int_from_bytes(bytes::Vector{UInt8}, from::Int, to::Int)
+function int_from_bytes(pc::ParserContext{<:Associative,IntType}, 
+                        ps::ParserState, 
+                        bytes::Vector{UInt8}, 
+                        from::Int, 
+                        to::Int) where IntType <: Real
     @inbounds isnegative = bytes[from] == MINUS_SIGN ? (from += 1; true) : false
-    num = Int64(0)
+    num = IntType(0)
     @inbounds for i in from:to
-        num = Int64(10) * num + Int64(bytes[i] - DIGIT_ZERO)
+        num = IntType(10) * num + IntType(bytes[i] - DIGIT_ZERO)
     end
     ifelse(isnegative, -num, num)
 end
 
-function number_from_bytes(
-        ps::ParserState, isint::Bool, bytes::Vector{UInt8}, from::Int, to::Int)
+function number_from_bytes(pc::ParserContext, 
+                           ps::ParserState, 
+                           isint::Bool, 
+                           bytes::Vector{UInt8}, 
+                           from::Int, 
+                           to::Int)
     @inbounds if hasleadingzero(bytes, from, to)
         _error(E_LEADING_ZERO, ps)
     end
@@ -332,7 +342,7 @@ function number_from_bytes(
         @inbounds if to == from && bytes[from] == MINUS_SIGN
             _error(E_BAD_NUMBER, ps)
         end
-        int_from_bytes(bytes, from, to)
+        int_from_bytes(pc, ps, bytes, from, to)
     else
         res = float_from_bytes(bytes, from, to)
         isnull(res) ? _error(E_BAD_NUMBER, ps) : get(res)
@@ -340,7 +350,7 @@ function number_from_bytes(
 end
 
 
-function parse_number(ps::ParserState)
+function parse_number(pc::ParserContext, ps::ParserState)
     # Determine the end of the floating point by skipping past ASCII values
     # 0-9, +, -, e, E, and .
     number = UInt8[]
@@ -361,7 +371,7 @@ function parse_number(ps::ParserState)
         incr!(ps)
     end
 
-    number_from_bytes(ps, isint, number, 1, length(number))
+    number_from_bytes(pc, ps, isint, number, 1, length(number))
 end
 
 
@@ -370,9 +380,10 @@ function unparameterize_type(T::Type)
     candidate <: Union{} ? T : candidate
 end
 
-function parse(str::AbstractString; dicttype::Type{<:Associative}=Dict{String,Any})
+function parse(str::AbstractString; dicttype::Type{<:Associative}=Dict{String,Any}, inttype::Type{<:Real}=Int64)
+    pc = ParserContext{unparameterize_type(dicttype), inttype}()
     ps = MemoryParserState(Vector{UInt8}(String(str)), 1)
-    v = parse_value(ps, unparameterize_type(dicttype))
+    v = parse_value(pc, ps)
     chomp_space!(ps)
     if hasmore(ps)
         _error(E_EXPECTED_EOF, ps)
@@ -380,16 +391,20 @@ function parse(str::AbstractString; dicttype::Type{<:Associative}=Dict{String,An
     v
 end
 
-function parse(io::IO; dicttype::Type{<:Associative}=Dict{String,Any})
+function parse(io::IO; dicttype::Type{<:Associative}=Dict{String,Any}, inttype::Type{<:Real}=Int64)
+    pc = ParserContext{unparameterize_type(dicttype), inttype}()
     ps = StreamingParserState(io)
-    parse_value(ps, unparameterize_type(dicttype))
+    parse_value(pc, ps)
 end
 
-function parsefile(filename::AbstractString; dicttype::Type{<:Associative}=Dict{String, Any}, use_mmap=true)
+function parsefile(filename::AbstractString; 
+                   dicttype::Type{<:Associative}=Dict{String, Any}, 
+                   inttype::Type{<:Real}=Int64, 
+                   use_mmap=true)
     sz = filesize(filename)
     open(filename) do io
         s = use_mmap ? String(Mmap.mmap(io, Vector{UInt8}, sz)) : read(io, String)
-        parse(s; dicttype=dicttype)
+        parse(s; dicttype=dicttype, inttype=inttype)
     end
 end
 
diff --git a/test/parser/inttype.jl b/test/parser/inttype.jl
new file mode 100644
index 0000000..30e9ca1
--- /dev/null
+++ b/test/parser/inttype.jl
@@ -0,0 +1,16 @@
+@testset for T in [Int32, Int64, Int128, BigInt]
+    val = JSON.parse("{\"x\": 3}", inttype=T)
+    @test isa(val, Dict{String, Any})
+    @test length(val) == 1
+    key = collect(keys(val))[1]
+    @test string(key) == "x"
+    value = val[key]
+    @test value == 3
+    @test typeof(value) == T
+end
+
+@testset begin
+    teststr = """{"201736327611975630": 18005722827070440994}"""
+    val = JSON.parse(teststr, inttype=Int128)
+    @test val == Dict{String,Any}("201736327611975630"=> 18005722827070440994)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 15d6b75..e27310c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -20,6 +20,10 @@ include("json-samples.jl")
         include("parser/dicttype.jl")
     end
 
+    @testset "inttype" begin
+        include("parser/inttype.jl")
+    end
+
     @testset "Miscellaneous" begin
         # test for single values
         @test JSON.parse("true") == true