]> Cypherpunks repositories - gostls13.git/commitdiff
net/url: accept non-ASCII bytes in URL per RFC 3986
authorRuss Cox <rsc@golang.org>
Fri, 4 Dec 2015 17:15:30 +0000 (12:15 -0500)
committerBrad Fitzpatrick <bradfitz@golang.org>
Mon, 7 Dec 2015 14:01:56 +0000 (14:01 +0000)
Fixes #7991.
Fixes #12719.

Change-Id: I5650fa35ec5d49addeda6cc6e7fa93cfbe1cdfc0
Reviewed-on: https://go-review.googlesource.com/17385
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>

src/net/url/url.go
src/net/url/url_test.go

index e7c08b348dc3dd1e4974491c608a2f2ac60adb42..510ac77ede5c224d1c64357c3944166d856469a7 100644 (file)
@@ -71,6 +71,7 @@ type encoding int
 const (
        encodePath encoding = 1 + iota
        encodeHost
+       encodeZone
        encodeUserPassword
        encodeQueryComponent
        encodeFragment
@@ -93,7 +94,7 @@ func shouldEscape(c byte, mode encoding) bool {
                return false
        }
 
-       if mode == encodeHost {
+       if mode == encodeHost || mode == encodeZone {
                // §3.2.2 Host allows
                //      sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
                // as part of reg-name.
@@ -166,6 +167,27 @@ func unescape(s string, mode encoding) (string, error) {
                                }
                                return "", EscapeError(s)
                        }
+                       // Per https://tools.ietf.org/html/rfc3986#page-21
+                       // in the host component %-encoding can only be used
+                       // for non-ASCII bytes.
+                       // But https://tools.ietf.org/html/rfc6874#section-2
+                       // introduces %25 being allowed to escape a percent sign
+                       // in IPv6 scoped-address literals. Yay.
+                       if mode == encodeHost && unhex(s[i+1]) < 8 && s[i:i+3] != "%25" {
+                               return "", EscapeError(s[i : i+3])
+                       }
+                       if mode == encodeZone {
+                               // RFC 6874 says basically "anything goes" for zone identifiers
+                               // and that even non-ASCII can be redundantly escaped,
+                               // but it seems prudent to restrict %-escaped bytes here to those
+                               // that are valid host name bytes in their unescaped form.
+                               // That is, you can use escaping in the zone identifier but not
+                               // to introduce bytes you couldn't just write directly.
+                               v := unhex(s[i+1])<<4 | unhex(s[i+2])
+                               if s[i:i+3] != "%25" && shouldEscape(v, encodeHost) {
+                                       return "", EscapeError(s[i : i+3])
+                               }
+                       }
                        i += 3
                case '+':
                        hasPlus = mode == encodeQueryComponent
@@ -496,14 +518,9 @@ func parseAuthority(authority string) (user *Userinfo, host string, err error) {
 // parseHost parses host as an authority without user
 // information. That is, as host[:port].
 func parseHost(host string) (string, error) {
-       litOrName := host
        if strings.HasPrefix(host, "[") {
                // Parse an IP-Literal in RFC 3986 and RFC 6874.
-               // E.g., "[fe80::1], "[fe80::1%25en0]"
-               //
-               // RFC 4007 defines "%" as a delimiter character in
-               // the textual representation of IPv6 addresses.
-               // Per RFC 6874, in URIs that "%" is encoded as "%25".
+               // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
                i := strings.LastIndex(host, "]")
                if i < 0 {
                        return "", errors.New("missing ']' in host")
@@ -512,29 +529,31 @@ func parseHost(host string) (string, error) {
                if !validOptionalPort(colonPort) {
                        return "", fmt.Errorf("invalid port %q after host", colonPort)
                }
-               // Parse a host subcomponent without a ZoneID in RFC
-               // 6874 because the ZoneID is allowed to use the
-               // percent encoded form.
-               j := strings.Index(host[:i], "%25")
-               if j < 0 {
-                       litOrName = host[1:i]
-               } else {
-                       litOrName = host[1:j]
+
+               // RFC 6874 defines that %25 (%-encoded percent) introduces
+               // the zone identifier, and the zone identifier can use basically
+               // any %-encoding it likes. That's different from the host, which
+               // can only %-encode non-ASCII bytes.
+               // We do impose some restrictions on the zone, to avoid stupidity
+               // like newlines.
+               zone := strings.Index(host[:i], "%25")
+               if zone >= 0 {
+                       host1, err := unescape(host[:zone], encodeHost)
+                       if err != nil {
+                               return "", err
+                       }
+                       host2, err := unescape(host[zone:i], encodeZone)
+                       if err != nil {
+                               return "", err
+                       }
+                       host3, err := unescape(host[i:], encodeHost)
+                       if err != nil {
+                               return "", err
+                       }
+                       return host1 + host2 + host3, nil
                }
        }
 
-       // A URI containing an IP-Literal without a ZoneID or
-       // IPv4address in RFC 3986 and RFC 6847 must not be
-       // percent-encoded.
-       //
-       // A URI containing a DNS registered name in RFC 3986 is
-       // allowed to be percent-encoded, though we don't use it for
-       // now to avoid messing up with the gap between allowed
-       // characters in URI and allowed characters in DNS.
-       // See golang.org/issue/7991.
-       if strings.Contains(litOrName, "%") {
-               return "", errors.New("percent-encoded characters in host")
-       }
        var err error
        if host, err = unescape(host, encodeHost); err != nil {
                return "", err
index 037e8549ad01650ce33628c8c35567007084601c..da022f20d412141dc1b97b89b6895e2ad1b116c4 100644 (file)
@@ -483,6 +483,34 @@ var urltests = []URLTest{
                },
                "",
        },
+       // golang.org/issue/7991 and golang.org/issue/12719 (non-ascii %-encoded in host)
+       {
+               "http://hello.世界.com/foo",
+               &URL{
+                       Scheme: "http",
+                       Host:   "hello.世界.com",
+                       Path:   "/foo",
+               },
+               "http://hello.%E4%B8%96%E7%95%8C.com/foo",
+       },
+       {
+               "http://hello.%e4%b8%96%e7%95%8c.com/foo",
+               &URL{
+                       Scheme: "http",
+                       Host:   "hello.世界.com",
+                       Path:   "/foo",
+               },
+               "http://hello.%E4%B8%96%E7%95%8C.com/foo",
+       },
+       {
+               "http://hello.%E4%B8%96%E7%95%8C.com/foo",
+               &URL{
+                       Scheme: "http",
+                       Host:   "hello.世界.com",
+                       Path:   "/foo",
+               },
+               "",
+       },
 }
 
 // more useful string for debugging than fmt's struct printer
@@ -1184,11 +1212,11 @@ func TestParseAuthority(t *testing.T) {
                {"http://[::1]%23", true},
                {"http://[::1%25en0]", false},     // valid zone id
                {"http://[::1]:", false},          // colon, but no port OK
-               {"http://[::1]:%38%30", true},     // no hex in port
-               {"http://[::1%25%10]", false},     // TODO: reject the %10 after the valid zone %25 separator?
+               {"http://[::1]:%38%30", true},     // not allowed: % encoding only for non-ASCII
+               {"http://[::1%25%41]", false},     // RFC 6874 allows over-escaping in zone
                {"http://[%10::1]", true},         // no %xx escapes in IP address
                {"http://[::1]/%48", false},       // %xx in path is fine
-               {"http://%41:8080/", true},        // TODO: arguably we should accept reg-name with %xx
+               {"http://%41:8080/", true},        // not allowed: % encoding only for non-ASCII
                {"mysql://x@y(z:123)/foo", false}, // golang.org/issue/12023
                {"mysql://x@y(1.2.3.4:123)/foo", false},
                {"mysql://x@y([2001:db8::1]:123)/foo", false},