From: Russ Cox Date: Fri, 4 Dec 2015 17:15:30 +0000 (-0500) Subject: net/url: accept non-ASCII bytes in URL per RFC 3986 X-Git-Tag: go1.6beta1~167 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=a6869d1c5c5172dda7fdbea5e6b6ffbd2f997db1;p=gostls13.git net/url: accept non-ASCII bytes in URL per RFC 3986 Fixes #7991. Fixes #12719. Change-Id: I5650fa35ec5d49addeda6cc6e7fa93cfbe1cdfc0 Reviewed-on: https://go-review.googlesource.com/17385 Reviewed-by: Brad Fitzpatrick Run-TryBot: Brad Fitzpatrick --- diff --git a/src/net/url/url.go b/src/net/url/url.go index e7c08b348d..510ac77ede 100644 --- a/src/net/url/url.go +++ b/src/net/url/url.go @@ -71,6 +71,7 @@ type encoding int const ( encodePath encoding = 1 + iota encodeHost + encodeZone encodeUserPassword encodeQueryComponent encodeFragment @@ -93,7 +94,7 @@ func shouldEscape(c byte, mode encoding) bool { return false } - if mode == encodeHost { + if mode == encodeHost || mode == encodeZone { // §3.2.2 Host allows // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" // as part of reg-name. @@ -166,6 +167,27 @@ func unescape(s string, mode encoding) (string, error) { } return "", EscapeError(s) } + // Per https://tools.ietf.org/html/rfc3986#page-21 + // in the host component %-encoding can only be used + // for non-ASCII bytes. + // But https://tools.ietf.org/html/rfc6874#section-2 + // introduces %25 being allowed to escape a percent sign + // in IPv6 scoped-address literals. Yay. + if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" { + return "", EscapeError(s[i : i+3]) + } + if mode == encodeZone { + // RFC 6874 says basically "anything goes" for zone identifiers + // and that even non-ASCII can be redundantly escaped, + // but it seems prudent to restrict %-escaped bytes here to those + // that are valid host name bytes in their unescaped form. + // That is, you can use escaping in the zone identifier but not + // to introduce bytes you couldn't just write directly. + v := unhex(s[i+1])<<4 | unhex(s[i+2]) + if s[i:i+3] != "%25" && shouldEscape(v, encodeHost) { + return "", EscapeError(s[i : i+3]) + } + } i += 3 case '+': hasPlus = mode == encodeQueryComponent @@ -496,14 +518,9 @@ func parseAuthority(authority string) (user *Userinfo, host string, err error) { // parseHost parses host as an authority without user // information. That is, as host[:port]. func parseHost(host string) (string, error) { - litOrName := host if strings.HasPrefix(host, "[") { // Parse an IP-Literal in RFC 3986 and RFC 6874. - // E.g., "[fe80::1], "[fe80::1%25en0]" - // - // RFC 4007 defines "%" as a delimiter character in - // the textual representation of IPv6 addresses. - // Per RFC 6874, in URIs that "%" is encoded as "%25". + // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80". i := strings.LastIndex(host, "]") if i < 0 { return "", errors.New("missing ']' in host") @@ -512,29 +529,31 @@ func parseHost(host string) (string, error) { if !validOptionalPort(colonPort) { return "", fmt.Errorf("invalid port %q after host", colonPort) } - // Parse a host subcomponent without a ZoneID in RFC - // 6874 because the ZoneID is allowed to use the - // percent encoded form. - j := strings.Index(host[:i], "%25") - if j < 0 { - litOrName = host[1:i] - } else { - litOrName = host[1:j] + + // RFC 6874 defines that %25 (%-encoded percent) introduces + // the zone identifier, and the zone identifier can use basically + // any %-encoding it likes. That's different from the host, which + // can only %-encode non-ASCII bytes. + // We do impose some restrictions on the zone, to avoid stupidity + // like newlines. + zone := strings.Index(host[:i], "%25") + if zone >= 0 { + host1, err := unescape(host[:zone], encodeHost) + if err != nil { + return "", err + } + host2, err := unescape(host[zone:i], encodeZone) + if err != nil { + return "", err + } + host3, err := unescape(host[i:], encodeHost) + if err != nil { + return "", err + } + return host1 + host2 + host3, nil } } - // A URI containing an IP-Literal without a ZoneID or - // IPv4address in RFC 3986 and RFC 6847 must not be - // percent-encoded. - // - // A URI containing a DNS registered name in RFC 3986 is - // allowed to be percent-encoded, though we don't use it for - // now to avoid messing up with the gap between allowed - // characters in URI and allowed characters in DNS. - // See golang.org/issue/7991. - if strings.Contains(litOrName, "%") { - return "", errors.New("percent-encoded characters in host") - } var err error if host, err = unescape(host, encodeHost); err != nil { return "", err diff --git a/src/net/url/url_test.go b/src/net/url/url_test.go index 037e8549ad..da022f20d4 100644 --- a/src/net/url/url_test.go +++ b/src/net/url/url_test.go @@ -483,6 +483,34 @@ var urltests = []URLTest{ }, "", }, + // golang.org/issue/7991 and golang.org/issue/12719 (non-ascii %-encoded in host) + { + "http://hello.世界.com/foo", + &URL{ + Scheme: "http", + Host: "hello.世界.com", + Path: "/foo", + }, + "http://hello.%E4%B8%96%E7%95%8C.com/foo", + }, + { + "http://hello.%e4%b8%96%e7%95%8c.com/foo", + &URL{ + Scheme: "http", + Host: "hello.世界.com", + Path: "/foo", + }, + "http://hello.%E4%B8%96%E7%95%8C.com/foo", + }, + { + "http://hello.%E4%B8%96%E7%95%8C.com/foo", + &URL{ + Scheme: "http", + Host: "hello.世界.com", + Path: "/foo", + }, + "", + }, } // more useful string for debugging than fmt's struct printer @@ -1184,11 +1212,11 @@ func TestParseAuthority(t *testing.T) { {"http://[::1]%23", true}, {"http://[::1%25en0]", false}, // valid zone id {"http://[::1]:", false}, // colon, but no port OK - {"http://[::1]:%38%30", true}, // no hex in port - {"http://[::1%25%10]", false}, // TODO: reject the %10 after the valid zone %25 separator? + {"http://[::1]:%38%30", true}, // not allowed: % encoding only for non-ASCII + {"http://[::1%25%41]", false}, // RFC 6874 allows over-escaping in zone {"http://[%10::1]", true}, // no %xx escapes in IP address {"http://[::1]/%48", false}, // %xx in path is fine - {"http://%41:8080/", true}, // TODO: arguably we should accept reg-name with %xx + {"http://%41:8080/", true}, // not allowed: % encoding only for non-ASCII {"mysql://x@y(z:123)/foo", false}, // golang.org/issue/12023 {"mysql://x@y(1.2.3.4:123)/foo", false}, {"mysql://x@y([2001:db8::1]:123)/foo", false},