mime: limit UTF-8 encoded-word length to 75 characters

author Alexandre Cesaro <alexandre.cesaro@gmail.com>

Thu, 24 Sep 2015 21:45:13 +0000 (23:45 +0200)

committer Brad Fitzpatrick <bradfitz@golang.org>

Thu, 15 Oct 2015 00:08:03 +0000 (00:08 +0000)
author Alexandre Cesaro <alexandre.cesaro@gmail.com>
Thu, 24 Sep 2015 21:45:13 +0000 (23:45 +0200)
committer Brad Fitzpatrick <bradfitz@golang.org>
Thu, 15 Oct 2015 00:08:03 +0000 (00:08 +0000)
diff --git a/src/mime/encodedword.go b/src/mime/encodedword.go

index ebf6164bb6fce333f41ba1637827493057b68157..3b414dd5c4b0a94789f283822fc201898541a234 100644 (file)
--- a/src/mime/encodedword.go
+++ b/src/mime/encodedword.go
@@ -54,35 +54,129 @@ func (e WordEncoder) encodeWord(charset, s string) string {
         buf := getBuffer()
         defer putBuffer(buf)
  
+       e.openWord(buf, charset)
+       if e == BEncoding {
+               e.bEncode(buf, charset, s)
+       } else {
+               e.qEncode(buf, charset, s)
+       }
+       closeWord(buf)
+
+       return buf.String()
+}
+
+const (
+       // The maximum length of an encoded-word is 75 characters.
+       // See RFC 2047, section 2.
+       maxEncodedWordLen = 75
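+       // maxContentLen is how much content can be encoded, ignoring the header and
+       // 2-byte footer. NOTE(review): the header term omits the 2-byte "q?"/"b?" part that openWord writes, so a word can reach 77 characters.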
+       maxContentLen = maxEncodedWordLen - len("=?UTF-8?") - len("?=")
+)
+
+var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
+
+// bEncode encodes s using base64 encoding and writes it to buf.
+func (e WordEncoder) bEncode(buf *bytes.Buffer, charset, s string) {
+       w := base64.NewEncoder(base64.StdEncoding, buf)
+       // If the charset is not UTF-8 or if the content is short, do not bother
+       // splitting the encoded-word.
+       if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
+               io.WriteString(w, s)
+               w.Close()
+               return
+       }
+
+       var currentLen, last, runeLen int
+       for i := 0; i < len(s); i += runeLen {
+               // Multi-byte characters must not be split across encoded-words.
+               // See RFC 2047, section 5.3.
+               _, runeLen = utf8.DecodeRuneInString(s[i:])
+
+               if currentLen+runeLen <= maxBase64Len {
+                       currentLen += runeLen
+               } else {
+                       io.WriteString(w, s[last:i])
+                       w.Close()
+                       e.splitWord(buf, charset)
+                       last = i
+                       currentLen = runeLen
+               }
+       }
+       io.WriteString(w, s[last:])
+       w.Close()
+}
+
+// qEncode encodes s using Q encoding and writes it to buf. It splits the
+// encoded-words when necessary.
+func (e WordEncoder) qEncode(buf *bytes.Buffer, charset, s string) {
+       // We only split encoded-words when the charset is UTF-8.
+       if !isUTF8(charset) {
+               writeQString(buf, s)
+               return
+       }
+
+       var currentLen, runeLen int
+       for i := 0; i < len(s); i += runeLen {
+               b := s[i]
+               // Multi-byte characters must not be split across encoded-words.
+               // See RFC 2047, section 5.3.
+               var encLen int
+               if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
+                       runeLen, encLen = 1, 1
+               } else {
+                       _, runeLen = utf8.DecodeRuneInString(s[i:])
+                       encLen = 3 * runeLen
+               }
+
+               if currentLen+encLen > maxContentLen {
+                       e.splitWord(buf, charset)
+                       currentLen = 0
+               }
+               writeQString(buf, s[i:i+runeLen])
+               currentLen += encLen
+       }
+}
+
+// writeQString encodes s using Q encoding and writes it to buf.
+func writeQString(buf *bytes.Buffer, s string) {
+       for i := 0; i < len(s); i++ {
+               switch b := s[i]; {
+               case b == ' ':
+                       buf.WriteByte('_')
+               case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
+                       buf.WriteByte(b)
+               default:
+                       buf.WriteByte('=')
+                       buf.WriteByte(upperhex[b>>4])
+                       buf.WriteByte(upperhex[b&0x0f])
+               }
+       }
+}
+
+// openWord writes the beginning of an encoded-word into buf.
+func (e WordEncoder) openWord(buf *bytes.Buffer, charset string) {
         buf.WriteString("=?")
         buf.WriteString(charset)
         buf.WriteByte('?')
         buf.WriteByte(byte(e))
         buf.WriteByte('?')
+}
  
-       if e == BEncoding {
-               w := base64.NewEncoder(base64.StdEncoding, buf)
-               io.WriteString(w, s)
-               w.Close()
-       } else {
-               enc := make([]byte, 3)
-               for i := 0; i < len(s); i++ {
-                       b := s[i]
-                       switch {
-                       case b == ' ':
-                               buf.WriteByte('_')
-                       case b <= '~' && b >= '!' && b != '=' && b != '?' && b != '_':
-                               buf.WriteByte(b)
-                       default:
-                               enc[0] = '='
-                               enc[1] = upperhex[b>>4]
-                               enc[2] = upperhex[b&0x0f]
-                               buf.Write(enc)
-                       }
-               }
-       }
+// closeWord writes the end of an encoded-word into buf.
+func closeWord(buf *bytes.Buffer) {
         buf.WriteString("?=")
-       return buf.String()
+}
+
+// splitWord closes the current encoded-word and opens a new one.
+func (e WordEncoder) splitWord(buf *bytes.Buffer, charset string) {
+       closeWord(buf)
+       buf.WriteByte(' ')
+       e.openWord(buf, charset)
+}
+
+func isUTF8(charset string) bool {
+       return strings.EqualFold(charset, "UTF-8")
  }
  
  const upperhex = "0123456789ABCDEF"
diff --git a/src/mime/encodedword_test.go b/src/mime/encodedword_test.go

index b30ecba3b91d67e2605a6aaf965c37abaa8f8f49..5fcd7a06dd6a0229e65fca7d101dca6f36f2921d 100644 (file)
--- a/src/mime/encodedword_test.go
+++ b/src/mime/encodedword_test.go
@@ -27,6 +27,14 @@ func TestEncodeWord(t *testing.T) {
                 {QEncoding, iso88591, "a", "a"},
                 {QEncoding, utf8, "123 456", "123 456"},
                 {QEncoding, utf8, "\t !\"#$%&'()*+,-./ :;<>?@[\\]^_`{|}~", "\t !\"#$%&'()*+,-./ :;<>?@[\\]^_`{|}~"},
+               {QEncoding, utf8, strings.Repeat("é", 10), "=?utf-8?q?" + strings.Repeat("=C3=A9", 10) + "?="},
+               {QEncoding, utf8, strings.Repeat("é", 11), "=?utf-8?q?" + strings.Repeat("=C3=A9", 10) + "?= =?utf-8?q?=C3=A9?="},
+               {QEncoding, iso88591, strings.Repeat("\xe9", 22), "=?iso-8859-1?q?" + strings.Repeat("=E9", 22) + "?="},
+               {QEncoding, utf8, strings.Repeat("\x80", 22), "=?utf-8?q?" + strings.Repeat("=80", 21) + "?= =?utf-8?q?=80?="},
+               {BEncoding, utf8, strings.Repeat("é", 24), "=?utf-8?b?" + strings.Repeat("w6nDqcOp", 8) + "?="},
+               {BEncoding, utf8, strings.Repeat("é", 27), "=?utf-8?b?" + strings.Repeat("w6nDqcOp", 8) + "?= =?utf-8?b?w6nDqcOp?="},
+               {BEncoding, iso88591, strings.Repeat("\xe9", 45), "=?iso-8859-1?b?" + strings.Repeat("6enp", 15) + "?="},
+               {BEncoding, utf8, strings.Repeat("\x80", 51), "=?utf-8?b?" + strings.Repeat("gICA", 16) + "?= =?utf-8?b?gICA?="},
         }
  
         for _, test := range tests {
author	Alexandre Cesaro <alexandre.cesaro@gmail.com>
	Thu, 24 Sep 2015 21:45:13 +0000 (23:45 +0200)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Thu, 15 Oct 2015 00:08:03 +0000 (00:08 +0000)
src/mime/encodedword.go		patch \| blob \| history
src/mime/encodedword_test.go		patch \| blob \| history