From 97c5fd8bd0e86f737c7a503da4f166302fefaa4d Mon Sep 17 00:00:00 2001
From: Sam O'Connor
Date: Tue, 23 Jan 2018 11:04:25 +1100
Subject: [PATCH] Faster URI parsing per #151

 - Remove args -> string -> parse -> URI round-trip from constructors & merge()
 - Use parse_uri_reference() instead of slower http_parser_parse_url()
---
Review notes (ignored by `git am`, not part of the commit message):
 - pathissabsolute() called the undefined `startwith`; it now calls
   `startswith`, so isabsolute() no longer raises UndefVarError.
 - The strict scheme check used the class `[[:alnum:]+-.]`, where `+-.` is
   a range that also admits ","; it is now `[[:alnum:]+.-]`.
 - Base.print() now delegates to Base.string(), so URIs built by merge()
   (whose `uri` field is `nostring`) print the same text string() returns.
 - `== nothing` became `=== nothing`, and each docstring now sits directly
   above its definition so that Julia attaches it.
 - Hunk line counts are unchanged; only the diffstat totals differ.

 src/URIs.jl           | 167 ++++++++++++++++++++++++++++++------------
 test/uri.jl           |   8 +--
 test/uri_benchmark.jl |   2 +-
 test/url.jl           |   2 +-
 4 files changed, 124 insertions(+), 55 deletions(-)

diff --git a/src/URIs.jl b/src/URIs.jl
index b109c04ae..395372e0b 100644
--- a/src/URIs.jl
+++ b/src/URIs.jl
@@ -1,5 +1,9 @@
 module URIs
 
+export URI,
+       resource, queryparams, absuri,
+       escapeuri, unescapeuri, escapepath
+
 import Base.==
 
 import ..@require, ..precondition_error
@@ -9,10 +13,6 @@ import ..compat_search
 
 include("urlparser.jl")
 
-export URI,
-       resource, queryparams, absuri,
-       escapeuri, unescapeuri, escapepath
-
 """
     HTTP.URI(; scheme="", host="", port="", etc...)
 
@@ -63,6 +63,8 @@ end)()
 
 URI(;kw...) = merge(emptyuri; kw...)
 
+const nostring = ""
+
 function Base.merge(uri::URI; scheme::AbstractString=uri.scheme,
                               userinfo::AbstractString=uri.userinfo,
                               host::AbstractString=uri.host,
@@ -80,19 +82,7 @@ function Base.merge(uri::URI; scheme::AbstractString=uri.scheme,
     ports = string(port)
     querys = query isa String ? query : escapeuri(query)
 
-    str = uristring(scheme, userinfo, host, ports, path, querys, fragment)
-    result = parse(URI, str)
-
-    if uri === emptyuri
-        @ensure result.scheme == scheme
-        @ensure result.userinfo == userinfo
-        @ensure result.host == host
-        @ensure result.port == ports
-        @ensure result.path == path
-        @ensure result.query == querys
-    end
-
-    return result
+    return URI(nostring, scheme, userinfo, host, ports, path, querys, fragment)
 end
 
 
@@ -100,47 +90,119 @@ end
 
 # https://tools.ietf.org/html/rfc3986#appendix-B
 const uri_reference_regex = r"""^
-    (?: ([^:/?#]+) :) ?                     # 1. sheme
-    (?: // (?: ([^/?#@]*) @) ?              # 2. userinfo
-        (?| (?: \[ ([^\]]+) \] )            # 3. host (ipv6)
-          | ([^:/?#\[]*) )                  # 3. host
-        (?: : ([^/?#]+) ) ? ) ?             # 4. port
-    ([^?#]*)                                # 5. path
-    (?: \?([^#]*) ) ?                       # 6. query
-    (?: [#](.*) ) ?                         # 7. fragment
+    (?: ([^:/?#]+) :) ?                     # 1. scheme
+    (?: // (?: ([^/?#@]*) @) ?              # 2. userinfo
+        (?| (?: \[ ([^:\]]*:[^\]]*) \] )    # 3. host (ipv6)
+          | ([^:/?#\[]*) )                  # 3. host
+        (?: : ([^/?#]*) ) ? ) ?             # 4. port
+    ([^?#]*)                                # 5. path
+    (?: \?([^#]*) ) ?                       # 6. query
+    (?: [#](.*) ) ?                         # 7. fragment
 $"""x
 
 
-const empty = SubString("", 1, 0)
+const absent = SubString("", 1, 0)
+
+
+"""
+Parse `str` as a URI; throw `URLParsingError` if the scheme is missing.
+https://tools.ietf.org/html/rfc3986#section-3
+"""
+function parse_uri(str::AbstractString; kw...)
+    uri = parse_uri_reference(str; kw...)
+    if isempty(uri.scheme)
+        throw(URLParsingError("URI without scheme: $str"))
+    end
+    return uri
+end
 
-function regex_parse(::Type{URI}, str::AbstractString)
+
+"""
+Parse `str` as a URI reference; `strict=true` also validates each component.
+https://tools.ietf.org/html/rfc3986#section-4.1
+"""
+function parse_uri_reference(str::AbstractString; strict = false)
     m = match(uri_reference_regex, str)
-    if m == nothing
-        return emptyuri
+    if m === nothing
+        throw(URLParsingError("URI contains invalid character"))
+    end
+    uri = URI(str, (c = m[1]) === nothing ? absent : c,
+                   (c = m[2]) === nothing ? absent : c,
+                   (c = m[3]) === nothing ? absent : c,
+                   (c = m[4]) === nothing ? absent : c,
+                   (c = m[5]) === nothing ? absent : c,
+                   (c = m[6]) === nothing ? absent : c,
+                   (c = m[7]) === nothing ? absent : c)
+
+    if strict
+        ensurevalid(uri)
+        @ensure uristring(uri) == str
     end
-    return URI(str, (c = m[1]) == nothing ? empty : c,
-                    (c = m[2]) == nothing ? empty : c,
-                    (c = m[3]) == nothing ? empty : c,
-                    (c = m[4]) == nothing ? empty : c,
-                    (c = m[5]) == nothing ? empty : c,
-                    (c = m[6]) == nothing ? empty : c,
-                    (c = m[7]) == nothing ? empty : c)
+    return uri
 end
 
 
-URI(str::AbstractString) = Base.parse(URI, str)
-function Base.parse(::Type{URI}, str::AbstractString)
+URI(str::AbstractString) = parse_uri_reference(str)
 
-    uri = http_parser_parse_url(str)
+Base.parse(::Type{URI}, str::AbstractString) = parse_uri_reference(str)
 
-    #showparts(STDOUT, regex_parse(URI, str))
-    #showparts(STDOUT, uri)
-    @ensure regex_parse(URI, str) == uri
-    @ensure uristring(uri) == str
-    return uri
+"Throw `URLParsingError` unless the scheme, host, port and path of `uri` are valid."
+function ensurevalid(uri::URI)
+
+    # https://tools.ietf.org/html/rfc3986#section-3.1
+    # ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+    if !(uri.scheme === absent ||
+         ismatch(r"^[[:alpha:]][[:alnum:]+.-]*$", uri.scheme))
+        throw(URLParsingError("Invalid URI scheme: $(uri.scheme)"))
+    end
+    # https://tools.ietf.org/html/rfc3986#section-3.2.2
+    # unreserved / pct-encoded / sub-delims
+    if !(uri.host === absent ||
+         ismatch(r"^[:[:alnum:]\-._~%!$&'()*+,;=]+$", uri.host))
+        throw(URLParsingError("Invalid URI host: $(uri.host) $uri"))
+    end
+    # https://tools.ietf.org/html/rfc3986#section-3.2.3
+    # "port number in decimal"
+    if !(uri.port === absent || ismatch(r"^\d+$", uri.port))
+        throw(URLParsingError("Invalid URI port: $(uri.port)"))
+    end
+
+    # https://tools.ietf.org/html/rfc3986#section-3.3
+    # unreserved / pct-encoded / sub-delims / ":" / "@"
+    if !(uri.path === absent ||
+         ismatch(r"^[/[:alnum:]\-._~%!$&'()*+,;=:@]*$", uri.path))
+        throw(URLParsingError("Invalid URI path: $(uri.path)"))
+    end
+
+    # FIXME
+    # For compatibility with existing test/uri.jl
+    if !(uri.host === absent) &&
+        (contains(uri.host, "=") ||
+         contains(uri.host, ";") ||
+         contains(uri.host, "%"))
+        throw(URLParsingError("Invalid URI host: $(uri.host)"))
+    end
 end
 
 
+"""
+Is `uri` absolute: a scheme, no fragment and, given a host, an empty or rooted path?
+https://tools.ietf.org/html/rfc3986#section-4.3
+"""
+isabsolute(uri::URI) =
+    !isempty(uri.scheme) &&
+    isempty(uri.fragment) &&
+    (isempty(uri.host) || isempty(uri.path) || pathissabsolute(uri))
+
+
+"""
+`true` if the path of `uri` starts with "/".
+https://tools.ietf.org/html/rfc7230#section-5.3.1
+https://tools.ietf.org/html/rfc3986#section-3.3
+"""
+pathissabsolute(uri::URI) = startswith(uri.path, "/")
+
+
 ==(a::URI,b::URI) = a.scheme == b.scheme &&
                     a.host == b.host &&
                     normalport(a) == normalport(b) &&
@@ -149,7 +211,11 @@ end
                     a.fragment == b.fragment &&
                     a.userinfo == b.userinfo
 
-# "request-target" per https://tools.ietf.org/html/rfc7230#section-5.3
+
+"""
+Return the "request-target" of `uri`: its path ("/" if empty), query and fragment,
+per https://tools.ietf.org/html/rfc7230#section-5.3
+"""
 resource(uri::URI) = string( isempty(uri.path) ? "/" : uri.path,
                             !isempty(uri.query) ? "?" : "", uri.query,
                             !isempty(uri.fragment) ? "#" : "", uri.fragment)
@@ -172,11 +238,14 @@ showparts(io::IO, uri::URI) =
               " query = \"", uri.query, "\",\n",
               " fragment = \"", uri.fragment, "\")\n")
 
+showparts(uri::URI) = showparts(STDOUT, uri)
+
-Base.print(io::IO, u::URI) = print(io, u.uri)
+Base.print(io::IO, u::URI) = print(io, string(u))
 
-Base.string(u::URI) = u.uri
+Base.string(u::URI) = u.uri === nostring ? uristring(u) : u.uri
 
-isabsent(ui) = isempty(ui) && !(ui === blank)
+# A component is "absent" only when it is the `absent` sentinel, not merely empty.
+isabsent(ui) = ui === absent
 
 function formaturi(io::IO,
                    scheme::AbstractString,
@@ -191,7 +260,7 @@ function formaturi(io::IO,
                                            ":" : "://")
     isabsent(userinfo) || print(io, userinfo, "@")
     isempty(host) || print(io, hoststring(host))
-    isempty(port) || print(io, ":", port)
+    isabsent(port) || print(io, ":", port)
     isempty(path) || print(io, path)
     isabsent(query) || print(io, "?", query)
     isabsent(fragment) || print(io, "#", fragment)
diff --git a/test/uri.jl b/test/uri.jl
index 883a740c7..c8f8bbec2 100644
--- a/test/uri.jl
+++ b/test/uri.jl
@@ -95,11 +95,11 @@ end
 
     # Error paths
     # Non-ASCII characters
-    @test_throws HTTP.URIs.URLParsingError parse(HTTP.URI, "http://🍕.com")
+    @test_throws HTTP.URIs.URLParsingError HTTP.URIs.parse_uri("http://🍕.com", strict=true)
     # Unexpected start of URL
-    @test_throws HTTP.URIs.URLParsingError parse(HTTP.URI, ".google.com")
+    @test_throws HTTP.URIs.URLParsingError HTTP.URIs.parse_uri(".google.com", strict=true)
     # Unexpected character after scheme
-    @test_throws HTTP.URIs.URLParsingError parse(HTTP.URI, "ht!tp://google.com")
+    @test_throws HTTP.URIs.URLParsingError HTTP.URIs.parse_uri("ht!tp://google.com", strict=true)
 
     # Issue #27
     @test HTTP.escapeuri("t est\n") == "t%20est%0A"
@@ -453,7 +453,7 @@ end
                 @test port == u.expecteduri.port
             end
         elseif u.shouldthrow
-            @test_throws HTTP.URIs.URLParsingError parse(HTTP.URI, u.url)
+            @test_throws HTTP.URIs.URLParsingError HTTP.URIs.parse_uri_reference(u.url, strict=true)
         else
             url = parse(HTTP.URI, u.url)
             @test u.expecteduri == url
diff --git a/test/uri_benchmark.jl b/test/uri_benchmark.jl
index 0e67ad9d9..915b3f4ec 100644
--- a/test/uri_benchmark.jl
+++ b/test/uri_benchmark.jl
@@ -27,7 +27,7 @@ function go(count::Int)
     t_start = time()
     @time for rep in 1:count
         for url in urls
-            uri = HTTP.URIs.regex_parse(HTTP.URI, url)
+            uri = HTTP.URIs.parse_uri_reference(url)
         end
     end
     t_done = time()
diff --git a/test/url.jl b/test/url.jl
index 99e69dc0b..6f93fc65a 100644
--- a/test/url.jl
+++ b/test/url.jl
@@ -24,7 +24,7 @@ for group in tests
         url = test["url"]
         uri = nothing
         try
-            uri = parse(HTTP.URI, url)
+            uri = HTTP.URIs.parse_uri_reference(url; strict=true)
         catch e
             if e isa HTTP.URIs.URLParsingError
                 println(e)