Skip to content

Commit

Permalink
Faster URI parsing per #151
Browse files Browse the repository at this point in the history
 - Remove args -> string -> parse -> URI round-trip from constructors & merge()
 - Use parse_uri_reference() instead of slower http_parser_parse_url()
  • Loading branch information
samoconnor committed Jan 23, 2018
1 parent 91edbff commit 97c5fd8
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 53 deletions.
163 changes: 116 additions & 47 deletions src/URIs.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
module URIs

export URI,
resource, queryparams, absuri,
escapeuri, unescapeuri, escapepath

import Base.==

import ..@require, ..precondition_error
Expand All @@ -9,10 +13,6 @@ import ..compat_search

include("urlparser.jl")

export URI,
resource, queryparams, absuri,
escapeuri, unescapeuri, escapepath


"""
HTTP.URI(; scheme="", host="", port="", etc...)
Expand Down Expand Up @@ -63,6 +63,8 @@ end)()

# Keyword constructor: build a URI by applying the given keyword overrides
# to `emptyuri` via `merge`.
URI(;kw...) = merge(emptyuri; kw...)

# Placeholder passed as the first (whole-URI text) argument when a URI is
# assembled from parts rather than parsed from a string; `Base.string`
# checks for it with `===` to decide whether the text must be rebuilt.
const nostring = ""

function Base.merge(uri::URI; scheme::AbstractString=uri.scheme,
userinfo::AbstractString=uri.userinfo,
host::AbstractString=uri.host,
Expand All @@ -80,67 +82,127 @@ function Base.merge(uri::URI; scheme::AbstractString=uri.scheme,
ports = string(port)
querys = query isa String ? query : escapeuri(query)

str = uristring(scheme, userinfo, host, ports, path, querys, fragment)
result = parse(URI, str)

if uri === emptyuri
@ensure result.scheme == scheme
@ensure result.userinfo == userinfo
@ensure result.host == host
@ensure result.port == ports
@ensure result.path == path
@ensure result.query == querys
end

return result
return URI(nostring, scheme, userinfo, host, ports, path, querys, fragment)
end


# Based on regex from RFC 3986:
# https://tools.ietf.org/html/rfc3986#appendix-B
const uri_reference_regex =
r"""^
(?: ([^:/?#]+) :) ? # 1. sheme
(?: // (?: ([^/?#@]*) @) ? # 2. userinfo
(?| (?: \[ ([^\]]+) \] ) # 3. host (ipv6)
| ([^:/?#\[]*) ) # 3. host
(?: : ([^/?#]+) ) ? ) ? # 4. port
([^?#]*) # 5. path
(?: \?([^#]*) ) ? # 6. query
(?: [#](.*) ) ? # 7. fragment
(?: ([^:/?#]+) :) ? # 1. scheme
(?: // (?: ([^/?#@]*) @) ? # 2. userinfo
(?| (?: \[ ([^:\]]*:[^\]]*) \] ) # 3. host (ipv6)
| ([^:/?#\[]*) ) # 3. host
(?: : ([^/?#]*) ) ? ) ? # 4. port
([^?#]*) # 5. path
(?: \?([^#]*) ) ? # 6. query
(?: [#](.*) ) ? # 7. fragment
$"""x

const empty = SubString("", 1, 0)
# Sentinel used for a URI component whose regex capture group did not match
# (i.e. the component is missing from the input). Code in this module tests
# for it with `===` so that "missing" can be told apart from other values.
const absent = SubString("", 1, 0)


"""
https://tools.ietf.org/html/rfc3986#section-3
"""

function parse_uri(str::AbstractString; kw...)
uri = parse_uri_reference(str; kw...)
if isempty(uri.scheme)
throw(URLParsingError("URI without scheme: $str"))
end
return uri
end

function regex_parse(::Type{URI}, str::AbstractString)

"""
https://tools.ietf.org/html/rfc3986#section-4.1
"""

function parse_uri_reference(str::AbstractString; strict = false)

m = match(uri_reference_regex, str)
if m == nothing
return emptyuri
throw(URLParsingError("URI contains invalid character"))
end
uri = URI(str, (c = m[1]) == nothing ? absent : c,
(c = m[2]) == nothing ? absent : c,
(c = m[3]) == nothing ? absent : c,
(c = m[4]) == nothing ? absent : c,
(c = m[5]) == nothing ? absent : c,
(c = m[6]) == nothing ? absent : c,
(c = m[7]) == nothing ? absent : c)

if strict
ensurevalid(uri)
@ensure uristring(uri) == str
end
return URI(str, (c = m[1]) == nothing ? empty : c,
(c = m[2]) == nothing ? empty : c,
(c = m[3]) == nothing ? empty : c,
(c = m[4]) == nothing ? empty : c,
(c = m[5]) == nothing ? empty : c,
(c = m[6]) == nothing ? empty : c,
(c = m[7]) == nothing ? empty : c)
return uri
end

URI(str::AbstractString) = Base.parse(URI, str)

function Base.parse(::Type{URI}, str::AbstractString)
URI(str::AbstractString) = parse_uri_reference(str)

uri = http_parser_parse_url(str)
Base.parse(::Type{URI}, str::AbstractString) = parse_uri_reference(str)

#showparts(STDOUT, regex_parse(URI, str))
#showparts(STDOUT, uri)
@ensure regex_parse(URI, str) == uri
@ensure uristring(uri) == str
return uri

"""
    ensurevalid(uri::URI)

Check the `scheme`, `host`, `port` and `path` components of `uri` against
character rules taken from RFC 3986 and throw `URLParsingError` for the
first component that fails. A component that is `absent` is not checked.

Returns `nothing` when every check passes.
"""
function ensurevalid(uri::URI)

    # https://tools.ietf.org/html/rfc3986#section-3.1
    # ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
    # NOTE: "-" is placed last in the character class so it is a literal.
    # Written as "+-." it forms the range '+'..'.', which also admits ','.
    if !(uri.scheme === absent ||
         ismatch(r"^[[:alpha:]][[:alnum:]+.-]*$", uri.scheme))
        throw(URLParsingError("Invalid URI scheme: $(uri.scheme)"))
    end

    # https://tools.ietf.org/html/rfc3986#section-3.2.2
    # unreserved / pct-encoded / sub-delims
    if !(uri.host === absent ||
         ismatch(r"^[:[:alnum:]\-._~%!$&'()*+,;=]+$", uri.host))
        throw(URLParsingError("Invalid URI host: $(uri.host) $uri"))
    end

    # https://tools.ietf.org/html/rfc3986#section-3.2.3
    # "port number in decimal"
    if !(uri.port === absent || ismatch(r"^\d+$", uri.port))
        throw(URLParsingError("Invalid URI port: $(uri.port)"))
    end

    # https://tools.ietf.org/html/rfc3986#section-3.3
    # unreserved / pct-encoded / sub-delims / ":" / "@"
    if !(uri.path === absent ||
         ismatch(r"^[/[:alnum:]\-._~%!$&'()*+,;=:@]*$", uri.path))
        throw(URLParsingError("Invalid URI path: $(uri.path)"))
    end

    # FIXME
    # For compatibility with existing test/uri.jl
    # Stricter than the host rule above: "=", ";" and "%" are rejected even
    # though the host character class accepts them.
    if !(uri.host === absent) &&
        (contains(uri.host, "=") ||
         contains(uri.host, ";") ||
         contains(uri.host, "%"))
        throw(URLParsingError("Invalid URI host: $(uri.host)"))
    end

    return nothing
end


"""
https://tools.ietf.org/html/rfc3986#section-4.3
"""

isabsolute(uri::URI) =
!isempty(uri.scheme) &&
isempty(uri.fragment) &&
(isempty(uri.host) || isempty(uri.path) || pathissabsolute(uri))


"""
https://tools.ietf.org/html/rfc7230#section-5.3.1
https://tools.ietf.org/html/rfc3986#section-3.3
"""

pathissabsolute(uri::URI) = startwith(uri.path, "/")


==(a::URI,b::URI) = a.scheme == b.scheme &&
a.host == b.host &&
normalport(a) == normalport(b) &&
Expand All @@ -149,7 +211,11 @@ end
a.fragment == b.fragment &&
a.userinfo == b.userinfo

# "request-target" per https://tools.ietf.org/html/rfc7230#section-5.3

"""
"request-target" per https://tools.ietf.org/html/rfc7230#section-5.3
"""

resource(uri::URI) = string( isempty(uri.path) ? "/" : uri.path,
!isempty(uri.query) ? "?" : "", uri.query,
!isempty(uri.fragment) ? "#" : "", uri.fragment)
Expand All @@ -172,11 +238,14 @@ showparts(io::IO, uri::URI) =
" query = \"", uri.query, "\",\n",
" fragment = \"", uri.fragment, "\")\n")

# Convenience method: write the component breakdown of `uri` to STDOUT.
showparts(uri::URI) = showparts(STDOUT, uri)

# Print the textual form of `u`. This goes through `string(u)` rather than
# the raw `u.uri` field so that a URI assembled from parts (whose stored text
# is the `nostring` placeholder) is rendered from its components instead of
# printing as an empty string.
Base.print(io::IO, u::URI) = print(io, string(u))

Base.string(u::URI) = u.uri
# Return the text of `u`: the stored `uri` field when it holds real text,
# otherwise (the field is the `nostring` placeholder) the string rebuilt
# from the components by `uristring`.
function Base.string(u::URI)
    text = u.uri
    if text === nostring
        return uristring(u)
    end
    return text
end

isabsent(ui) = isempty(ui) && !(ui === blank)
#isabsent(ui) = isempty(ui) && !(ui === blank)
# True when `ui` is the `absent` sentinel. This is an identity (`===`) test,
# not an emptiness test, so a value that is not `absent` yields `false`
# even when it is empty.
isabsent(ui) = ui === absent

function formaturi(io::IO,
scheme::AbstractString,
Expand All @@ -191,7 +260,7 @@ function formaturi(io::IO,
":" : "://")
isabsent(userinfo) || print(io, userinfo, "@")
isempty(host) || print(io, hoststring(host))
isempty(port) || print(io, ":", port)
isabsent(port) || print(io, ":", port)
isempty(path) || print(io, path)
isabsent(query) || print(io, "?", query)
isabsent(fragment) || print(io, "#", fragment)
Expand Down
8 changes: 4 additions & 4 deletions test/uri.jl
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,11 @@ end

# Error paths
# Non-ASCII characters
@test_throws HTTP.URIs.URLParsingError parse(HTTP.URI, "http://🍕.com")
@test_throws HTTP.URIs.URLParsingError HTTP.URIs.parse_uri("http://🍕.com", strict=true)
# Unexpected start of URL
@test_throws HTTP.URIs.URLParsingError parse(HTTP.URI, ".google.com")
@test_throws HTTP.URIs.URLParsingError HTTP.URIs.parse_uri(".google.com", strict=true)
# Unexpected character after scheme
@test_throws HTTP.URIs.URLParsingError parse(HTTP.URI, "ht!tp://google.com")
@test_throws HTTP.URIs.URLParsingError HTTP.URIs.parse_uri("ht!tp://google.com", strict=true)

# Issue #27
@test HTTP.escapeuri("t est\n") == "t%20est%0A"
Expand Down Expand Up @@ -453,7 +453,7 @@ end
@test port == u.expecteduri.port
end
elseif u.shouldthrow
@test_throws HTTP.URIs.URLParsingError parse(HTTP.URI, u.url)
@test_throws HTTP.URIs.URLParsingError HTTP.URIs.parse_uri_reference(u.url, strict=true)
else
url = parse(HTTP.URI, u.url)
@test u.expecteduri == url
Expand Down
2 changes: 1 addition & 1 deletion test/uri_benchmark.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ function go(count::Int)
t_start = time()
@time for rep in 1:count
for url in urls
uri = HTTP.URIs.regex_parse(HTTP.URI, url)
uri = HTTP.URIs.parse_uri_reference(url)
end
end
t_done = time()
Expand Down
2 changes: 1 addition & 1 deletion test/url.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ for group in tests
url = test["url"]
uri = nothing
try
uri = parse(HTTP.URI, url)
uri = HTTP.URIs.parse_uri_reference(url; strict=true)
catch e
if e isa HTTP.URIs.URLParsingError
println(e)
Expand Down

0 comments on commit 97c5fd8

Please sign in to comment.