mime: RFC 2231 continuation / non-ASCII support

author Brad Fitzpatrick <bradfitz@golang.org>

Mon, 18 Apr 2011 17:59:39 +0000 (10:59 -0700)

committer Brad Fitzpatrick <bradfitz@golang.org>

Mon, 18 Apr 2011 17:59:39 +0000 (10:59 -0700)
author Brad Fitzpatrick <bradfitz@golang.org>
Mon, 18 Apr 2011 17:59:39 +0000 (10:59 -0700)
committer Brad Fitzpatrick <bradfitz@golang.org>
Mon, 18 Apr 2011 17:59:39 +0000 (10:59 -0700)
diff --git a/src/pkg/mime/mediatype.go b/src/pkg/mime/mediatype.go

index e9e649f95077ce9bc073e842787899e3b32cbbc7..9f8d2050e14bbd1c0aba6b609fc67638627f42ae 100644 (file)
--- a/src/pkg/mime/mediatype.go
+++ b/src/pkg/mime/mediatype.go
@@ -6,6 +6,8 @@ package mime
  
  import (
         "bytes"
+       "fmt"
+       "os"
         "strings"
         "unicode"
  )
@@ -46,11 +48,16 @@ func ParseMediaType(v string) (mediatype string, params map[string]string) {
  
         params = make(map[string]string)
  
+       // Map of base parameter name -> parameter name -> value
+       // for parameters containing a '*' character.
+       // Lazily initialized.
+       var continuation map[string]map[string]string
+
         v = v[i:]
         for len(v) > 0 {
                 v = strings.TrimLeftFunc(v, unicode.IsSpace)
                 if len(v) == 0 {
-                       return
+                       break
                 }
                 key, value, rest := consumeMediaParam(v)
                 if key == "" {
@@ -62,12 +69,83 @@ func ParseMediaType(v string) (mediatype string, params map[string]string) {
                         // Parse error.
                         return "", nil
                 }
-               params[key] = value
+
+               pmap := params
+               if idx := strings.Index(key, "*"); idx != -1 {
+                       baseName := key[:idx]
+                       if continuation == nil {
+                               continuation = make(map[string]map[string]string)
+                       }
+                       var ok bool
+                       if pmap, ok = continuation[baseName]; !ok {
+                               continuation[baseName] = make(map[string]string)
+                               pmap = continuation[baseName]
+                       }
+               }
+               if _, exists := pmap[key]; exists {
+                       // Duplicate parameter name is bogus.
+                       return "", nil
+               }
+               pmap[key] = value
                 v = rest
         }
+
+       // Stitch together any continuations or things with stars
+       // (i.e. RFC 2231 things with stars: "foo*0" or "foo*")
+       var buf bytes.Buffer
+       for key, pieceMap := range continuation {
+               singlePartKey := key + "*"
+               if v, ok := pieceMap[singlePartKey]; ok {
+                       decv := decode2231Enc(v)
+                       params[key] = decv
+                       continue
+               }
+
+               buf.Reset()
+               valid := false
+               for n := 0; ; n++ {
+                       simplePart := fmt.Sprintf("%s*%d", key, n)
+                       if v, ok := pieceMap[simplePart]; ok {
+                               valid = true
+                               buf.WriteString(v)
+                               continue
+                       }
+                       encodedPart := simplePart + "*"
+                       if v, ok := pieceMap[encodedPart]; ok {
+                               valid = true
+                               if n == 0 {
+                                       buf.WriteString(decode2231Enc(v))
+                               } else {
+                                       decv, _ := percentHexUnescape(v)
+                                       buf.WriteString(decv)
+                               }
+                       } else {
+                               break
+                       }
+               }
+               if valid {
+                       params[key] = buf.String()
+               }
+       }
+
         return
  }
  
+func decode2231Enc(v string) string {
+       sv := strings.Split(v, "'", 3)
+       if len(sv) != 3 {
+               return ""
+       }
+       // Ignoring lang in sv[1] for now.
+       charset := strings.ToLower(sv[0])
+       if charset != "us-ascii" && charset != "utf-8" {
+               // TODO: unsupported encoding
+               return ""
+       }
+       encv, _ := percentHexUnescape(sv[2])
+       return encv
+}
+
  // isNotTokenChar reports the negation of IsTokenChar for rune.
  func isNotTokenChar(rune int) bool {
         return !IsTokenChar(rune)
  }
@@ -107,17 +185,14 @@ func consumeValue(v string) (value, rest string) {
         for idx, rune = range rest {
                 switch {
                 case nextIsLiteral:
-                       if rune >= 0x80 {
-                               return "", v
-                       }
                         buffer.WriteRune(rune)
                         nextIsLiteral = false
                 case rune == leadQuote:
                         return buffer.String(), rest[idx+1:]
-               case IsQText(rune):
-                       buffer.WriteRune(rune)
                 case rune == '\\':
                         nextIsLiteral = true
+               case rune != '\r' && rune != '\n':
+                       buffer.WriteRune(rune)
                 default:
                         return "", v
                 }
@@ -137,6 +212,7 @@ func consumeMediaParam(v string) (param, value, rest string) {
         if param == "" {
                 return "", "", v
         }
+
         rest = strings.TrimLeftFunc(rest, unicode.IsSpace)
         if !strings.HasPrefix(rest, "=") {
                 return "", "", v
@@ -149,3 +225,66 @@ func consumeMediaParam(v string) (param, value, rest string) {
         }
         return param, value, rest
  }
+
+func percentHexUnescape(s string) (string, os.Error) {
+       // Count %, check that they're well-formed.
+       percents := 0
+       for i := 0; i < len(s); {
+               if s[i] != '%' {
+                       i++
+                       continue
+               }
+               percents++
+               if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
+                       s = s[i:]
+                       if len(s) > 3 {
+                               s = s[0:3]
+                       }
+                       return "", fmt.Errorf("Bogus characters after %: %q", s)
+               }
+               i += 3
+       }
+       if percents == 0 {
+               return s, nil
+       }
+
+       t := make([]byte, len(s)-2*percents)
+       j := 0
+       for i := 0; i < len(s); {
+               switch s[i] {
+               case '%':
+                       t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
+                       j++
+                       i += 3
+               default:
+                       t[j] = s[i]
+                       j++
+                       i++
+               }
+       }
+       return string(t), nil
+}
+
+func ishex(c byte) bool {
+       switch {
+       case '0' <= c && c <= '9':
+               return true
+       case 'a' <= c && c <= 'f':
+               return true
+       case 'A' <= c && c <= 'F':
+               return true
+       }
+       return false
+}
+
+func unhex(c byte) byte {
+       switch {
+       case '0' <= c && c <= '9':
+               return c - '0'
+       case 'a' <= c && c <= 'f':
+               return c - 'a' + 10
+       case 'A' <= c && c <= 'F':
+               return c - 'A' + 10
+       }
+       return 0
+}
diff --git a/src/pkg/mime/mediatype_test.go b/src/pkg/mime/mediatype_test.go

index f9603159575fa92b47ac086d88e592ff24b18e03..454ddd037781e618a7e4776d44544968fba8e031 100644 (file)
--- a/src/pkg/mime/mediatype_test.go
+++ b/src/pkg/mime/mediatype_test.go
@@ -114,6 +114,28 @@ func TestParseMediaType(t *testing.T) {
                         "form-data",
                         m("key", "value", "blah", "value", "name", "foo")},
  
+               {`foo; key=val1; key=the-key-appears-again-which-is-bogus`,
+                       "", m()},
+
+               // From RFC 2231:
+               {`application/x-stuff; title*=us-ascii'en-us'This%20is%20%2A%2A%2Afun%2A%2A%2A`,
+                       "application/x-stuff",
+                       m("title", "This is ***fun***")},
+
+               {`message/external-body; access-type=URL; ` +
+                       `URL*0="ftp://";` +
+                       `URL*1="cs.utk.edu/pub/moore/bulk-mailer/bulk-mailer.tar"`,
+                       "message/external-body",
+                       m("access-type", "URL",
+                               "URL", "ftp://cs.utk.edu/pub/moore/bulk-mailer/bulk-mailer.tar")},
+
+               {`application/x-stuff; ` +
+                       `title*0*=us-ascii'en'This%20is%20even%20more%20; ` +
+                       `title*1*=%2A%2A%2Afun%2A%2A%2A%20; ` +
+                       `title*2="isn't it!"`,
+                       "application/x-stuff",
+                       m("title", "This is even more ***fun*** isn't it!")},
+
                 // Tests from http://greenbytes.de/tech/tc2231/
                 // TODO(bradfitz): add the rest of the tests from that site.
                 {`attachment; filename="f\oo.html"`,
@@ -159,8 +181,41 @@ func TestParseMediaType(t *testing.T) {
                         "attachment",
                         m("creation-date", "Wed, 12 Feb 1997 16:29:51 -0500")},
                 {`foobar`, "foobar", m()},
-               // TODO(bradfitz): rest of them, including RFC2231 encoded UTF-8 and
-               // other charsets.
+               {`attachment; filename* =UTF-8''foo-%c3%a4.html`,
+                       "attachment",
+                       m("filename", "foo-ä.html")},
+               {`attachment; filename*=UTF-8''A-%2541.html`,
+                       "attachment",
+                       m("filename", "A-%41.html")},
+               {`attachment; filename*0="foo."; filename*1="html"`,
+                       "attachment",
+                       m("filename", "foo.html")},
+               {`attachment; filename*0*=UTF-8''foo-%c3%a4; filename*1=".html"`,
+                       "attachment",
+                       m("filename", "foo-ä.html")},
+               {`attachment; filename*0="foo"; filename*01="bar"`,
+                       "attachment",
+                       m("filename", "foo")},
+               {`attachment; filename*0="foo"; filename*2="bar"`,
+                       "attachment",
+                       m("filename", "foo")},
+               {`attachment; filename*1="foo"; filename*2="bar"`,
+                       "attachment", m()},
+               {`attachment; filename*1="bar"; filename*0="foo"`,
+                       "attachment",
+                       m("filename", "foobar")},
+               {`attachment; filename="foo-ae.html"; filename*=UTF-8''foo-%c3%a4.html`,
+                       "attachment",
+                       m("filename", "foo-ä.html")},
+               {`attachment; filename*=UTF-8''foo-%c3%a4.html; filename="foo-ae.html"`,
+                       "attachment",
+                       m("filename", "foo-ä.html")},
+
+               // Browsers also just send UTF-8 directly without RFC 2231,
+               // at least when the source page is served with UTF-8.
+               {`form-data; firstname="Брэд"; lastname="Фицпатрик"`,
+                       "form-data",
+                       m("firstname", "Брэд", "lastname", "Фицпатрик")},
         }
         for _, test := range tests {
                 mt, params := ParseMediaType(test.in)
author	Brad Fitzpatrick <bradfitz@golang.org>
	Mon, 18 Apr 2011 17:59:39 +0000 (10:59 -0700)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Mon, 18 Apr 2011 17:59:39 +0000 (10:59 -0700)
src/pkg/mime/mediatype.go		patch \| blob \| history
src/pkg/mime/mediatype_test.go		patch \| blob \| history