From: Nigel Tao Date: Fri, 14 Oct 2011 04:22:02 +0000 (+1100) Subject: html: fix some tokenizer bugs with attribute key/values. X-Git-Tag: weekly.2011-10-18~79 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=b82a8e7c22b1445fd063b8ddb55f9fb474b999ec;p=gostls13.git html: fix some tokenizer bugs with attribute key/values. The relevant spec sections are 13.2.4.38-13.2.4.40. http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#attribute-value-(double-quoted)-state R=andybalholm CC=golang-dev https://golang.org/cl/5262044 --- diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go index 64b7008870..dcece8cacb 100644 --- a/src/pkg/html/token.go +++ b/src/pkg/html/token.go @@ -205,14 +205,11 @@ func (z *Tokenizer) readByte() byte { return x } -func (z *Tokenizer) savePendingAttr() { - if z.pendingAttr[0].start != z.pendingAttr[0].end { - z.attr = append(z.attr, z.pendingAttr) - } -} - // skipWhiteSpace skips past any white space. func (z *Tokenizer) skipWhiteSpace() { + if z.err != nil { + return + } for { c := z.readByte() if z.err != nil { @@ -332,135 +329,132 @@ func (z *Tokenizer) nextTag() { z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags") return } - // Read the tag name, and attribute key/value pairs. - if z.readTagName() { - for z.readTagAttrKey() && z.readTagAttrVal() { - z.savePendingAttr() + // Read the tag name and attribute key/value pairs. + z.readTagName() + for { + if z.skipWhiteSpace(); z.err != nil { + break + } + c := z.readByte() + if z.err != nil || c == '>' { + break + } + z.raw.end-- + z.readTagAttrKey() + z.readTagAttrVal() + // Save pendingAttr if it has a non-empty key. + if z.pendingAttr[0].start != z.pendingAttr[0].end { + z.attr = append(z.attr, z.pendingAttr) } - } - // If we didn't get a final ">", assume that it's a text token. - // TODO: this isn't right: html5lib treats "

" and returns whether the tag -// may have attributes. -func (z *Tokenizer) readTagName() (more bool) { +// readTagName sets z.data to the "p" in "

". +func (z *Tokenizer) readTagName() { for { c := z.readByte() if z.err != nil { - return false + z.data.end = z.raw.end + return } switch c { - case ' ', '\n', '\t', '\f', '/': + case ' ', '\n', '\r', '\t', '\f': z.data.end = z.raw.end - 1 - return true - case '>': - // We cannot have a self-closing token, since the case above catches - // the "/" in "

". - z.data.end = z.raw.end - len(">") - return false + return + case '/', '>': + z.raw.end-- + z.data.end = z.raw.end + return } } - panic("unreachable") } -// readTagAttrKey sets z.pendingAttr[0] to the "a" in "

" and returns -// whether the tag may have an attribute value. -func (z *Tokenizer) readTagAttrKey() (more bool) { - if z.skipWhiteSpace(); z.err != nil { - return false - } +// readTagAttrKey sets z.pendingAttr[0] to the "k" in "

". +// Precondition: z.err == nil. +func (z *Tokenizer) readTagAttrKey() { z.pendingAttr[0].start = z.raw.end - z.pendingAttr[0].end = z.raw.end - z.pendingAttr[1].start = z.raw.end - z.pendingAttr[1].end = z.raw.end for { c := z.readByte() if z.err != nil { - return false + z.pendingAttr[0].end = z.raw.end + return } switch c { case ' ', '\n', '\r', '\t', '\f', '/': z.pendingAttr[0].end = z.raw.end - 1 - return true - case '=': + return + case '=', '>': z.raw.end-- z.pendingAttr[0].end = z.raw.end - return true - case '>': - z.pendingAttr[0].end = z.raw.end - 1 - z.savePendingAttr() - return false + return } } - panic("unreachable") } -// readTagAttrVal sets z.pendingAttr[1] to the "1" in "

" and returns -// whether the tag may have more attributes. -func (z *Tokenizer) readTagAttrVal() (more bool) { +// readTagAttrVal sets z.pendingAttr[1] to the "v" in "

". +func (z *Tokenizer) readTagAttrVal() { + z.pendingAttr[1].start = z.raw.end + z.pendingAttr[1].end = z.raw.end if z.skipWhiteSpace(); z.err != nil { - return false + return } - for { - c := z.readByte() - if z.err != nil { - return false - } - if c == '=' { - break - } + c := z.readByte() + if z.err != nil { + return + } + if c != '=' { z.raw.end-- - return true + return } if z.skipWhiteSpace(); z.err != nil { - return false + return } + quote := z.readByte() + if z.err != nil { + return + } + switch quote { + case '>': + z.raw.end-- + return - const delimAnyWhiteSpace = 1 -loop: - for delim := byte(0); ; { - c := z.readByte() - if z.err != nil { - return false + case '\'', '"': + z.pendingAttr[1].start = z.raw.end + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[1].end = z.raw.end + return + } + if c == quote { + z.pendingAttr[1].end = z.raw.end - 1 + return + } } - if delim == 0 { + + default: + z.pendingAttr[1].start = z.raw.end - 1 + for { + c := z.readByte() + if z.err != nil { + z.pendingAttr[1].end = z.raw.end + return + } switch c { - case '\'', '"': - delim = c - default: - delim = delimAnyWhiteSpace + case ' ', '\n', '\r', '\t', '\f': + z.pendingAttr[1].end = z.raw.end - 1 + return + case '>': z.raw.end-- + z.pendingAttr[1].end = z.raw.end + return } - z.pendingAttr[1].start = z.raw.end - continue - } - switch c { - case '/', '>': - z.raw.end-- - z.pendingAttr[1].end = z.raw.end - break loop - case ' ', '\n', '\r', '\t', '\f': - if delim != delimAnyWhiteSpace { - continue - } - fallthrough - case delim: - z.pendingAttr[1].end = z.raw.end - 1 - break loop } } - return true } // nextText reads all text up until an '<'. diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go index 178df27d14..f1082fce43 100644 --- a/src/pkg/html/token_test.go +++ b/src/pkg/html/token_test.go @@ -52,21 +52,38 @@ var tokenTests = []tokenTest{ `

`, `

`, }, - /* - // TODO: re-enable these tests when they work. This input/output matches html5lib's behavior. - { - "malformed tag #2", - `

`, - `

`, - }, - { - "malformed tag #3", - `

`, - }, - */ + { + "malformed tag #2", + `

`, + }, + { + "malformed tag #3", + `

`, + }, { "malformed tag #4", + `

`, + `

`, + }, + { + "malformed tag #5", + `

`, + }, + { + "malformed tag #6", + `

`, + `

`, + }, + { + "malformed tag #7", + `

`, + }, + { + "malformed tag #8", `

`, `

`, },