]> Cypherpunks repositories - gostls13.git/commitdiff
Fixes to URL functionality:
authorSteve Newman <devnull@localhost>
Tue, 2 Jun 2009 19:48:18 +0000 (12:48 -0700)
committerSteve Newman <devnull@localhost>
Tue, 2 Jun 2009 19:48:18 +0000 (12:48 -0700)
- Extend http.URLUnescape to convert '+' to space
- Add http.URLEscape
- Rename URL.Query to EncodedQuery (and stop decoding it, as decoding this field
  before separating key/value pairs loses important information)
- Report a clean error on incomplete hex escapes
- Update existing tests, add new ones

APPROVED=rsc
DELTA=293  (256 added, 3 deleted, 34 changed)
OCL=29685
CL=29759

src/lib/http/url.go
src/lib/http/url_test.go

index db51d99aa2a8eb7acd5f8ecc5c50a209d96df80c..0325b04eedd56cc3aecc3fa5b3284f3323d7dee9 100644 (file)
@@ -41,37 +41,60 @@ func unhex(c byte) byte {
        return 0
 }
 
+// Return true if the specified character should be escaped when appearing in a
+// URL string.
+//
+// TODO: for now, this is a hack; it only flags a few common characters that have
+// special meaning in URLs.  That will get the job done in the common cases.
+func shouldEscape(c byte) bool {
+       switch c {
+       case ' ', '?', '&', '=', '#', '+', '%':
+               return true;
+       }
+       return false;
+}
+
 // URLUnescape unescapes a URL-encoded string,
-// converting %AB into the byte 0xAB.
+// converting %AB into the byte 0xAB and '+' into ' ' (space).
 // It returns a BadURL error if any % is not followed
 // by two hexadecimal digits.
 func URLUnescape(s string) (string, os.Error) {
        // Count %, check that they're well-formed.
        n := 0;
+       anyPlusses := false;
        for i := 0; i < len(s); {
-               if s[i] == '%' {
+               switch s[i] {
+               case '%':
                        n++;
-                       if !ishex(s[i+1]) || !ishex(s[i+2]) {
+                       if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
                                return "", BadURL{"invalid hexadecimal escape"}
                        }
-                       i += 3
-               } else {
+                       i += 3;
+               case '+':
+                       anyPlusses = true;
+                       i++;
+               default:
                        i++
                }
        }
 
-       if n == 0 {
+       if n == 0 && !anyPlusses {
                return s, nil
        }
 
        t := make([]byte, len(s)-2*n);
        j := 0;
        for i := 0; i < len(s); {
-               if s[i] == '%' {
+               switch s[i] {
+               case '%':
                        t[j] = unhex(s[i+1]) << 4 | unhex(s[i+2]);
                        j++;
                        i += 3;
-               } else {
+               case '+':
+                       t[j] = ' ';
+                       j++;
+                       i++;
+               default:
                        t[j] = s[i];
                        j++;
                        i++;
@@ -80,9 +103,53 @@ func URLUnescape(s string) (string, os.Error) {
        return string(t), nil;
 }
 
+// URLEscape converts a string into URL-encoded form.
+func URLEscape(s string) string {
+       spaceCount, hexCount := 0, 0;
+       for i := 0; i < len(s); i++ {
+               c := s[i];
+               if (shouldEscape(c)) {
+                       if (c == ' ') {
+                               spaceCount++;
+                       } else {
+                               hexCount++;
+                       }
+               }
+       }
+
+       if spaceCount == 0 && hexCount == 0 {
+               return s;
+       }
+
+       t := make([]byte, len(s)+2*hexCount);
+       j := 0;
+       for i := 0; i < len(s); i++ {
+               c := s[i];
+               if !shouldEscape(c) {
+                       t[j] = s[i];
+                       j++;
+               } else if (c == ' ') {
+                       t[j] = '+';
+                       j++;
+               } else {
+                       t[j] = '%';
+                       t[j+1] = "0123456789abcdef"[c>>4];
+                       t[j+2] = "0123456789abcdef"[c&15];
+                       j += 3;
+               }
+       }
+       return string(t);
+}
+
 // A URL represents a parsed URL (technically, a URI reference).
 // The general form represented is:
 //     scheme://[userinfo@]host/path[?query][#fragment]
+// The Raw, RawPath, and RawQuery fields are in "wire format" (special
+// characters must be hex-escaped if not meant to have special meaning).
+// All other fields are logical values; '+' or '%' represent themselves.
+//
+// Note, the reason for using wire format for the query is that it needs
+// to be split into key/value pairs before decoding.
 type URL struct {
        Raw string;             // the original string
        Scheme string;          // scheme
@@ -91,7 +158,7 @@ type URL struct {
        Userinfo string;        // userinfo
        Host string;            // host
        Path string;            // /path
-       Query string;           // query
+       RawQuery string;        // query
        Fragment string;        // fragment
 }
 
@@ -156,10 +223,7 @@ func ParseURL(rawurl string) (url *URL, err os.Error) {
        // RFC 2396: a relative URI (no scheme) has a ?query,
        // but absolute URIs only have query if path begins with /
        if url.Scheme == "" || len(path) > 0 && path[0] == '/' {
-               path, url.Query = split(path, '?', true);
-               if url.Query, err = URLUnescape(url.Query); err != nil {
-                       return nil, err
-               }
+               path, url.RawQuery = split(path, '?', true);
        }
 
        // Maybe path is //authority/path
@@ -180,6 +244,21 @@ func ParseURL(rawurl string) (url *URL, err os.Error) {
                return nil, err
        }
 
+       // Remove escapes from the Authority and Userinfo fields, and verify
+       // that Scheme and Host contain no escapes (that would be illegal).
+       if url.Authority, err = URLUnescape(url.Authority); err != nil {
+               return nil, err
+       }
+       if url.Userinfo, err = URLUnescape(url.Userinfo); err != nil {
+               return nil, err
+       }
+       if (strings.Index(url.Scheme, "%") >= 0) {
+               return nil, BadURL{"hexadecimal escape in scheme"}
+       }
+       if (strings.Index(url.Host, "%") >= 0) {
+               return nil, BadURL{"hexadecimal escape in host"}
+       }
+
        return url, nil
 }
 
@@ -200,7 +279,7 @@ func ParseURLReference(rawurlref string) (url *URL, err os.Error) {
 //
 // There are redundant fields stored in the URL structure:
 // the String method consults Scheme, Path, Host, Userinfo,
-// Query, and Fragment, but not RawPath or Authority.
+// RawQuery, and Fragment, but not Raw, RawPath or Authority.
 func (url *URL) String() string {
        result := "";
        if url.Scheme != "" {
@@ -209,16 +288,16 @@ func (url *URL) String() string {
        if url.Host != "" || url.Userinfo != "" {
                result += "//";
                if url.Userinfo != "" {
-                       result += url.Userinfo + "@";
+                       result += URLEscape(url.Userinfo) + "@";
                }
                result += url.Host;
        }
-       result += url.Path;
-       if url.Query != "" {
-               result += "?" + url.Query;
+       result += URLEscape(url.Path);
+       if url.RawQuery != "" {
+               result += "?" + url.RawQuery;
        }
        if url.Fragment != "" {
-               result += "#" + url.Fragment;
+               result += "#" + URLEscape(url.Fragment);
        }
        return result;
 }
index f5a7069aeab911242fca9cc8a5afd9238bab75f6..8d8fabad5f903cf928dea9ac02db53d3656989d9 100644 (file)
@@ -20,6 +20,7 @@ import (
 type URLTest struct {
        in string;
        out *URL;
+       roundtrip string; // expected result of reserializing the URL; empty means same as "in".
 }
 
 var urltests = []URLTest {
@@ -31,7 +32,8 @@ var urltests = []URLTest {
                        "http", "//www.google.com",
                        "www.google.com", "", "www.google.com",
                        "", "", ""
-               }
+               },
+               ""
        },
        // path
        URLTest{
@@ -41,7 +43,19 @@ var urltests = []URLTest {
                        "http", "//www.google.com/",
                        "www.google.com", "", "www.google.com",
                        "/", "", ""
-               }
+               },
+               ""
+       },
+       // path with hex escaping... note that space roundtrips to +
+       URLTest{
+               "http://www.google.com/file%20one%26two",
+               &URL{
+                       "http://www.google.com/file%20one%26two",
+                       "http", "//www.google.com/file%20one%26two",
+                       "www.google.com", "", "www.google.com",
+                       "/file one&two", "", ""
+               },
+               "http://www.google.com/file+one%26two"
        },
        // user
        URLTest{
@@ -51,7 +65,19 @@ var urltests = []URLTest {
                        "ftp", "//webmaster@www.google.com/",
                        "webmaster@www.google.com", "webmaster", "www.google.com",
                        "/", "", ""
-               }
+               },
+               ""
+       },
+       // escape sequence in username
+       URLTest{
+               "ftp://john%20doe@www.google.com/",
+               &URL{
+                       "ftp://john%20doe@www.google.com/",
+                       "ftp", "//john%20doe@www.google.com/",
+                       "john doe@www.google.com", "john doe", "www.google.com",
+                       "/", "", ""
+               },
+               "ftp://john+doe@www.google.com/"
        },
        // query
        URLTest{
@@ -61,7 +87,19 @@ var urltests = []URLTest {
                        "http", "//www.google.com/?q=go+language",
                        "www.google.com", "", "www.google.com",
                        "/", "q=go+language", ""
-               }
+               },
+               ""
+       },
+       // query with hex escaping: NOT parsed
+       URLTest{
+               "http://www.google.com/?q=go%20language",
+               &URL{
+                       "http://www.google.com/?q=go%20language",
+                       "http", "//www.google.com/?q=go%20language",
+                       "www.google.com", "", "www.google.com",
+                       "/", "q=go%20language", ""
+               },
+               ""
        },
        // path without /, so no query parsing
        URLTest{
@@ -70,8 +108,9 @@ var urltests = []URLTest {
                        "http:www.google.com/?q=go+language",
                        "http", "www.google.com/?q=go+language",
                        "", "", "",
-                       "www.google.com/?q=go+language", "", ""
-               }
+                       "www.google.com/?q=go language", "", ""
+               },
+               "http:www.google.com/%3fq%3dgo+language"
        },
        // non-authority
        URLTest{
@@ -81,7 +120,8 @@ var urltests = []URLTest {
                        "mailto", "/webmaster@golang.org",
                        "", "", "",
                        "/webmaster@golang.org", "", ""
-               }
+               },
+               ""
        },
        // non-authority
        URLTest{
@@ -91,7 +131,8 @@ var urltests = []URLTest {
                        "mailto", "webmaster@golang.org",
                        "", "", "",
                        "webmaster@golang.org", "", ""
-               }
+               },
+               ""
        },
 }
 
@@ -103,7 +144,8 @@ var urlnofragtests = []URLTest {
                        "http", "//www.google.com/?q=go+language#foo",
                        "www.google.com", "", "www.google.com",
                        "/", "q=go+language#foo", ""
-               }
+               },
+               ""
        },
 }
 
@@ -115,7 +157,18 @@ var urlfragtests = []URLTest {
                        "http", "//www.google.com/?q=go+language",
                        "www.google.com", "", "www.google.com",
                        "/", "q=go+language", "foo"
-               }
+               },
+               ""
+       },
+       URLTest{
+               "http://www.google.com/?q=go+language#foo%26bar",
+               &URL{
+                       "http://www.google.com/?q=go+language",
+                       "http", "//www.google.com/?q=go+language",
+                       "www.google.com", "", "www.google.com",
+                       "/", "q=go+language", "foo&bar"
+               },
+               ""
        },
 }
 
@@ -123,7 +176,7 @@ var urlfragtests = []URLTest {
 func ufmt(u *URL) string {
        return fmt.Sprintf("%q, %q, %q, %q, %q, %q, %q, %q, %q",
                u.Raw, u.Scheme, u.RawPath, u.Authority, u.Userinfo,
-               u.Host, u.Path, u.Query, u.Fragment);
+               u.Host, u.Path, u.RawQuery, u.Fragment);
 }
 
 func DoTest(t *testing.T, parse func(string) (*URL, os.Error), name string, tests []URLTest) {
@@ -158,8 +211,12 @@ func DoTestString(t *testing.T, parse func(string) (*URL, os.Error), name string
                        continue;
                }
                s := u.String();
-               if s != tt.in {
-                       t.Errorf("%s(%q).String() == %q", tt.in, s);
+               expected := tt.in;
+               if len(tt.roundtrip) > 0 {
+                       expected = tt.roundtrip;
+               }
+               if s != expected {
+                       t.Errorf("%s(%q).String() == %q (expected %q)", name, tt.in, s, expected);
                }
        }
 }
@@ -172,3 +229,120 @@ func TestURLString(t *testing.T) {
        DoTestString(t, ParseURLReference, "ParseURLReference", urlfragtests);
        DoTestString(t, ParseURLReference, "ParseURLReference", urlnofragtests);
 }
+
+type URLEscapeTest struct {
+       in string;
+       out string;
+       err os.Error;
+}
+
+var unescapeTests = []URLEscapeTest {
+       URLEscapeTest{
+               "",
+               "",
+               nil
+       },
+       URLEscapeTest{
+               "abc",
+               "abc",
+               nil
+       },
+       URLEscapeTest{
+               "1%41",
+               "1A",
+               nil
+       },
+       URLEscapeTest{
+               "1%41%42%43",
+               "1ABC",
+               nil
+       },
+       URLEscapeTest{
+               "%4a",
+               "J",
+               nil
+       },
+       URLEscapeTest{
+               "%6F",
+               "o",
+               nil
+       },
+       URLEscapeTest{
+               "%", // not enough characters after %
+               "",
+               BadURL{"invalid hexadecimal escape"}
+       },
+       URLEscapeTest{
+               "%a", // not enough characters after %
+               "",
+               BadURL{"invalid hexadecimal escape"}
+       },
+       URLEscapeTest{
+               "%1", // not enough characters after %
+               "",
+               BadURL{"invalid hexadecimal escape"}
+       },
+       URLEscapeTest{
+               "123%45%6", // not enough characters after %
+               "",
+               BadURL{"invalid hexadecimal escape"}
+       },
+       URLEscapeTest{
+               "%zz", // invalid hex digits
+               "",
+               BadURL{"invalid hexadecimal escape"}
+       },
+}
+
+func TestURLUnescape(t *testing.T) {
+       for i, tt := range unescapeTests {
+               actual, err := URLUnescape(tt.in);
+               if actual != tt.out || (err != nil) != (tt.err != nil) {
+                       t.Errorf("URLUnescape(%q) = %q, %s; want %q, %s", tt.in, actual, err, tt.out, tt.err);
+               }
+       }
+}
+
+var escapeTests = []URLEscapeTest {
+       URLEscapeTest{
+               "",
+               "",
+               nil
+       },
+       URLEscapeTest{
+               "abc",
+               "abc",
+               nil
+       },
+       URLEscapeTest{
+               "one two",
+               "one+two",
+               nil
+       },
+       URLEscapeTest{
+               "10%",
+               "10%25",
+               nil
+       },
+       URLEscapeTest{
+               " ?&=#+%!",
+               "+%3f%26%3d%23%2b%25!",
+               nil
+       },
+}
+
+func TestURLEscape(t *testing.T) {
+       for i, tt := range escapeTests {
+               actual := URLEscape(tt.in);
+               if tt.out != actual {
+                       t.Errorf("URLEscape(%q) = %q, want %q", tt.in, actual, tt.out);
+               }
+
+               // for bonus points, verify that escape:unescape is an identity.
+               roundtrip, err := URLUnescape(actual);
+               if roundtrip != tt.in || err != nil {
+                       t.Errorf("URLUnescape(%q) = %q, %s; want %q, %s", actual, roundtrip, err, tt.in, "[no error]");
+               }
+       }
+}
+