exp/html: parse CDATA sections in foreign content

author Andrew Balholm <andybalholm@gmail.com>

Fri, 27 Jul 2012 06:05:25 +0000 (16:05 +1000)

committer Nigel Tao <nigeltao@golang.org>

Fri, 27 Jul 2012 06:05:25 +0000 (16:05 +1000)
author Andrew Balholm <andybalholm@gmail.com>
Fri, 27 Jul 2012 06:05:25 +0000 (16:05 +1000)
committer Nigel Tao <nigeltao@golang.org>
Fri, 27 Jul 2012 06:05:25 +0000 (16:05 +1000)
diff --git a/src/pkg/exp/html/parse.go b/src/pkg/exp/html/parse.go

index 82edb2263f8cda0f19b3f7529a341ef5a6cda1f5..986e9bbe460e78d35dc80511476650171106a13e 100644 (file)
--- a/src/pkg/exp/html/parse.go
+++ b/src/pkg/exp/html/parse.go
@@ -390,6 +390,10 @@ func (p *parser) reconstructActiveFormattingElements() {
  
  // read reads the next token from the tokenizer.
  func (p *parser) read() error {
+       // CDATA sections are allowed only in foreign content.
+       n := p.oe.top()
+       p.tokenizer.cdataOK = n != nil && n.Namespace != ""
+
         p.tokenizer.Next()
         p.tok = p.tokenizer.Token()
         if p.tok.Type == ErrorToken {
diff --git a/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log b/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log

index 65ee908f55e9c4fa292023466e0b81aeaa101f93..b63a237a15171f4b3d50b70c47a89a39fa610490 100644 (file)
--- a/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log
+++ b/src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log
@@ -8,9 +8,9 @@ PASS "<html><select>\x00"
  PASS "\x00"
  PASS "<body>\x00"
  PASS "<plaintext>\x00filler\x00text\x00"
-FAIL "<svg><![CDATA[\x00filler\x00text\x00]]>"
-FAIL "<body><!\x00>"
-FAIL "<body><!\x00filler\x00text>"
+PASS "<svg><![CDATA[\x00filler\x00text\x00]]>"
+PASS "<body><!\x00>"
+PASS "<body><!\x00filler\x00text>"
  PASS "<body><svg><foreignObject>\x00filler\x00text"
  FAIL "<svg>\x00filler\x00text"
  FAIL "<svg>\x00<frameset>"
diff --git a/src/pkg/exp/html/testlogs/tests21.dat.log b/src/pkg/exp/html/testlogs/tests21.dat.log

index c60a8cc3118bef2f8fab5c7413e7d17ed429f49e..98a1da886c0f45b4655162d338e394c3d717c505 100644 (file)
--- a/src/pkg/exp/html/testlogs/tests21.dat.log
+++ b/src/pkg/exp/html/testlogs/tests21.dat.log
@@ -1,22 +1,22 @@
-FAIL "<svg><![CDATA[foo]]>"
-FAIL "<math><![CDATA[foo]]>"
+PASS "<svg><![CDATA[foo]]>"
+PASS "<math><![CDATA[foo]]>"
  PASS "<div><![CDATA[foo]]>"
-FAIL "<svg><![CDATA[foo"
-FAIL "<svg><![CDATA[foo"
-FAIL "<svg><![CDATA["
-FAIL "<svg><![CDATA[]]>"
-FAIL "<svg><![CDATA[]] >]]>"
-FAIL "<svg><![CDATA[]] >]]>"
-FAIL "<svg><![CDATA[]]"
-FAIL "<svg><![CDATA[]"
-FAIL "<svg><![CDATA[]>a"
+PASS "<svg><![CDATA[foo"
+PASS "<svg><![CDATA[foo"
+PASS "<svg><![CDATA["
+PASS "<svg><![CDATA[]]>"
+PASS "<svg><![CDATA[]] >]]>"
+PASS "<svg><![CDATA[]] >]]>"
+PASS "<svg><![CDATA[]]"
+PASS "<svg><![CDATA[]"
+PASS "<svg><![CDATA[]>a"
  PASS "<svg><foreignObject><div><![CDATA[foo]]>"
-FAIL "<svg><![CDATA[<svg>]]>"
-FAIL "<svg><![CDATA[</svg>a]]>"
-FAIL "<svg><![CDATA[<svg>a"
-FAIL "<svg><![CDATA[</svg>a"
-FAIL "<svg><![CDATA[<svg>]]><path>"
-FAIL "<svg><![CDATA[<svg>]]></path>"
-FAIL "<svg><![CDATA[<svg>]]><!--path-->"
-FAIL "<svg><![CDATA[<svg>]]>path"
-FAIL "<svg><![CDATA[<!--svg-->]]>"
+PASS "<svg><![CDATA[<svg>]]>"
+PASS "<svg><![CDATA[</svg>a]]>"
+PASS "<svg><![CDATA[<svg>a"
+PASS "<svg><![CDATA[</svg>a"
+PASS "<svg><![CDATA[<svg>]]><path>"
+PASS "<svg><![CDATA[<svg>]]></path>"
+PASS "<svg><![CDATA[<svg>]]><!--path-->"
+PASS "<svg><![CDATA[<svg>]]>path"
+PASS "<svg><![CDATA[<!--svg-->]]>"
diff --git a/src/pkg/exp/html/token.go b/src/pkg/exp/html/token.go

index 3dc317ebb7df780baca02cf9cc1820efa7f11993..d4867fc17392bc2b591645b905c5b04543ed4bfe 100644 (file)
--- a/src/pkg/exp/html/token.go
+++ b/src/pkg/exp/html/token.go
@@ -155,6 +155,8 @@ type Tokenizer struct {
         // convertNUL is whether NUL bytes in the current token's data should
         // be converted into \ufffd replacement characters.
         convertNUL bool
+       // cdataOK is whether CDATA sections are allowed in the current context.
+       cdataOK bool
  }
  
  // Err returns the error associated with the most recent ErrorToken token.
@@ -347,8 +349,8 @@ func (z *Tokenizer) readUntilCloseAngle() {
  }
  
  // readMarkupDeclaration reads the next token starting with "<!". It might be
-// a "<!--comment-->", a "<!DOCTYPE foo>", or "<!a bogus comment". The opening
-// "<!" has already been consumed.
+// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
+// "<!a bogus comment". The opening "<!" has already been consumed.
  func (z *Tokenizer) readMarkupDeclaration() TokenType {
         z.data.start = z.raw.end
         var c [2]byte
@@ -364,27 +366,81 @@ func (z *Tokenizer) readMarkupDeclaration() TokenType {
                 return CommentToken
         }
         z.raw.end -= 2
+       if z.readDoctype() {
+               return DoctypeToken
+       }
+       if z.cdataOK && z.readCDATA() {
+               z.convertNUL = true
+               return TextToken
+       }
+       // It's a bogus comment.
+       z.readUntilCloseAngle()
+       return CommentToken
+}
+
+// readDoctype attempts to read a doctype declaration and returns true if
+// successful. The opening "<!" has already been consumed.
+func (z *Tokenizer) readDoctype() bool {
         const s = "DOCTYPE"
         for i := 0; i < len(s); i++ {
                 c := z.readByte()
                 if z.err != nil {
                         z.data.end = z.raw.end
-                       return CommentToken
+                       return false
                 }
                 if c != s[i] && c != s[i]+('a'-'A') {
                         // Back up to read the fragment of "DOCTYPE" again.
                         z.raw.end = z.data.start
-                       z.readUntilCloseAngle()
-                       return CommentToken
+                       return false
                 }
         }
         if z.skipWhiteSpace(); z.err != nil {
                 z.data.start = z.raw.end
                 z.data.end = z.raw.end
-               return DoctypeToken
+               return true
         }
         z.readUntilCloseAngle()
-       return DoctypeToken
+       return true
+}
+
+// readCDATA attempts to read a CDATA section and returns true if
+// successful. The opening "<!" has already been consumed.
+func (z *Tokenizer) readCDATA() bool {
+       const s = "[CDATA["
+       for i := 0; i < len(s); i++ {
+               c := z.readByte()
+               if z.err != nil {
+                       z.data.end = z.raw.end
+                       return false
+               }
+               if c != s[i] {
+                       // Back up to read the fragment of "[CDATA[" again.
+                       z.raw.end = z.data.start
+                       return false
+               }
+       }
+       z.data.start = z.raw.end
+       brackets := 0
+       for {
+               c := z.readByte()
+               if z.err != nil {
+                       z.data.end = z.raw.end
+                       return true
+               }
+               switch c {
+               case ']':
+                       brackets++
+               case '>':
+                       if brackets >= 2 {
+                               z.data.end = z.raw.end - len("]]>")
+                               return true
+                       }
+                       brackets = 0
+               default:
+                       brackets = 0
+               }
+       }
+       panic("unreachable")
  }
  
  // startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
@@ -751,7 +807,7 @@ func (z *Tokenizer) Text() []byte {
                 z.data.start = z.raw.end
                 z.data.end = z.raw.end
                 s = convertNewlines(s)
-               if z.convertNUL && bytes.Contains(s, nul) {
+               if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) {
                         s = bytes.Replace(s, nul, replacement, -1)
                 }
                 if !z.textIsRaw {
author	Andrew Balholm <andybalholm@gmail.com>
	Fri, 27 Jul 2012 06:05:25 +0000 (16:05 +1000)
committer	Nigel Tao <nigeltao@golang.org>
	Fri, 27 Jul 2012 06:05:25 +0000 (16:05 +1000)
src/pkg/exp/html/parse.go		patch \| blob \| history
src/pkg/exp/html/testlogs/plain-text-unsafe.dat.log		patch \| blob \| history
src/pkg/exp/html/testlogs/tests21.dat.log		patch \| blob \| history
src/pkg/exp/html/token.go		patch \| blob \| history