exp/html: replace NUL bytes in plaintext, raw text, and RCDATA

author Andrew Balholm <andybalholm@gmail.com>

Thu, 26 Jul 2012 23:27:10 +0000 (09:27 +1000)

committer Nigel Tao <nigeltao@golang.org>

Thu, 26 Jul 2012 23:27:10 +0000 (09:27 +1000)
author Andrew Balholm <andybalholm@gmail.com>
Thu, 26 Jul 2012 23:27:10 +0000 (09:27 +1000)
committer Nigel Tao <nigeltao@golang.org>
Thu, 26 Jul 2012 23:27:10 +0000 (09:27 +1000)
diff --git a/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log b/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log

index 56da0ba88f20dbf10b2a38c7960b3c3d7bf5cae7..65ee908f55e9c4fa292023466e0b81aeaa101f93 100644 (file)
--- a/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log
+++ b/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log
@@ -7,7 +7,7 @@ PASS "<html>\x00\n <frameset></frameset>"
  PASS "<html><select>\x00"
  PASS "\x00"
  PASS "<body>\x00"
-FAIL "<plaintext>\x00filler\x00text\x00"
+PASS "<plaintext>\x00filler\x00text\x00"
  FAIL "<svg><![CDATA[\x00filler\x00text\x00]]>"
  FAIL "<body><!\x00>"
  FAIL "<body><!\x00filler\x00text>"
diff --git a/src/pkg/exp/html/token.go b/src/pkg/exp/html/token.go

index b20de87beed082be694c22ffdfee17f0c238dfdd..3dc317ebb7df780baca02cf9cc1820efa7f11993 100644 (file)
--- a/src/pkg/exp/html/token.go
+++ b/src/pkg/exp/html/token.go
@@ -152,6 +152,9 @@ type Tokenizer struct {
         rawTag string
         // textIsRaw is whether the current text token's data is not escaped.
         textIsRaw bool
+       // convertNUL is whether NUL bytes in the current token's data should
+       // be converted into \ufffd replacement characters.
+       convertNUL bool
  }
  
  // Err returns the error associated with the most recent ErrorToken token.
@@ -597,16 +600,19 @@ func (z *Tokenizer) Next() TokenType {
                         for z.err == nil {
                                 z.readByte()
                         }
+                       z.data.end = z.raw.end
                         z.textIsRaw = true
                 } else {
                         z.readRawOrRCDATA()
                 }
                 if z.data.end > z.data.start {
                         z.tt = TextToken
+                       z.convertNUL = true
                         return z.tt
                 }
         }
         z.textIsRaw = false
+       z.convertNUL = false
  
  loop:
         for {
@@ -731,6 +737,11 @@ func convertNewlines(s []byte) []byte {
         return s
  }
  
+var (
+       nul         = []byte("\x00")
+       replacement = []byte("\ufffd")
+)
+
  // Text returns the unescaped text of a text, comment or doctype token. The
  // contents of the returned slice may change on the next call to Next.
  func (z *Tokenizer) Text() []byte {
@@ -740,6 +751,9 @@ func (z *Tokenizer) Text() []byte {
                 z.data.start = z.raw.end
                 z.data.end = z.raw.end
                 s = convertNewlines(s)
+               if z.convertNUL && bytes.Contains(s, nul) {
+                       s = bytes.Replace(s, nul, replacement, -1)
+               }
                 if !z.textIsRaw {
                         s = unescape(s, false)
                 }
author	Andrew Balholm <andybalholm@gmail.com>
	Thu, 26 Jul 2012 23:27:10 +0000 (09:27 +1000)
committer	Nigel Tao <nigeltao@golang.org>
	Thu, 26 Jul 2012 23:27:10 +0000 (09:27 +1000)
src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log		patch | blob | history
src/pkg/exp/html/token.go		patch | blob | history