From: Mike Samuel Date: Wed, 14 Sep 2011 00:53:55 +0000 (-0700) Subject: exp/template/html: move transition functions to a separate file X-Git-Tag: weekly.2011-09-16~41 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=15d47ce2194258021ad6f26da8ed866e42cbd1e9;p=gostls13.git exp/template/html: move transition functions to a separate file This CL moves code but makes no changes otherwise. R=nigeltao, r CC=golang-dev https://golang.org/cl/5012045 --- diff --git a/src/pkg/exp/template/html/Makefile b/src/pkg/exp/template/html/Makefile index e4fcee1ab9..620404b5ef 100644 --- a/src/pkg/exp/template/html/Makefile +++ b/src/pkg/exp/template/html/Makefile @@ -12,6 +12,7 @@ GOFILES=\ escape.go\ html.go\ js.go\ + transition.go\ url.go\ include ../../../../Make.pkg diff --git a/src/pkg/exp/template/html/escape.go b/src/pkg/exp/template/html/escape.go index a6385fe93d..22a5521340 100644 --- a/src/pkg/exp/template/html/escape.go +++ b/src/pkg/exp/template/html/escape.go @@ -12,7 +12,6 @@ import ( "fmt" "html" "os" - "strings" "template" "template/parse" ) @@ -471,555 +470,3 @@ func (e *escaper) template(name string) *template.Template { } return t } - -// transitionFunc is the array of context transition functions for text nodes. -// A transition function takes a context and template text input, and returns -// the updated context and any unconsumed text. -var transitionFunc = [...]func(context, []byte) (context, []byte){ - stateText: tText, - stateTag: tTag, - stateComment: tComment, - stateRCDATA: tSpecialTagEnd, - stateAttr: tAttr, - stateURL: tURL, - stateJS: tJS, - stateJSDqStr: tJSStr, - stateJSSqStr: tJSStr, - stateJSRegexp: tJSRegexp, - stateJSBlockCmt: tBlockCmt, - stateJSLineCmt: tLineCmt, - stateCSS: tCSS, - stateCSSDqStr: tCSSStr, - stateCSSSqStr: tCSSStr, - stateCSSDqURL: tCSSStr, - stateCSSSqURL: tCSSStr, - stateCSSURL: tCSSStr, - stateCSSBlockCmt: tBlockCmt, - stateCSSLineCmt: tLineCmt, - stateError: tError, -} - -var commentStart = []byte("") - -// tText is the context transition function for the text state. -func tText(c context, s []byte) (context, []byte) { - for { - i := bytes.IndexByte(s, '<') - if i == -1 || i+1 == len(s) { - return c, nil - } else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) { - return context{state: stateComment}, s[i+4:] - } - i++ - if s[i] == '/' { - if i+1 == len(s) { - return c, nil - } - i++ - } - j, e := eatTagName(s, i) - if j != i { - // We've found an HTML tag. - return context{state: stateTag, element: e}, s[j:] - } - s = s[j:] - } - panic("unreachable") -} - -var elementContentType = [...]state{ - elementNone: stateText, - elementScript: stateJS, - elementStyle: stateCSS, - elementTextarea: stateRCDATA, - elementTitle: stateRCDATA, -} - -// tTag is the context transition function for the tag state. -func tTag(c context, s []byte) (context, []byte) { - // Find the attribute name. - attrStart := eatWhiteSpace(s, 0) - i, err := eatAttrName(s, attrStart) - if err != nil { - return context{ - state: stateError, - errStr: err.String(), - }, nil - } - if i == len(s) { - return c, nil - } - state := stateAttr - canonAttrName := strings.ToLower(string(s[attrStart:i])) - if urlAttr[canonAttrName] { - state = stateURL - } else if strings.HasPrefix(canonAttrName, "on") { - state = stateJS - } else if canonAttrName == "style" { - state = stateCSS - } - - // Look for the start of the value. - i = eatWhiteSpace(s, i) - if i == len(s) { - return c, s[i:] - } - if s[i] == '>' { - state = elementContentType[c.element] - return context{state: state, element: c.element}, s[i+1:] - } else if s[i] != '=' { - // Possible due to a valueless attribute or '/' in "". - return c, s[i:] - } - // Consume the "=". - i = eatWhiteSpace(s, i+1) - - // Find the attribute delimiter. - delim := delimSpaceOrTagEnd - if i < len(s) { - switch s[i] { - case '\'': - delim, i = delimSingleQuote, i+1 - case '"': - delim, i = delimDoubleQuote, i+1 - } - } - - return context{state: state, delim: delim, element: c.element}, s[i:] -} - -// tComment is the context transition function for stateComment. -func tComment(c context, s []byte) (context, []byte) { - i := bytes.Index(s, commentEnd) - if i != -1 { - return context{}, s[i+3:] - } - return c, nil -} - -// specialTagEndMarkers maps element types to the character sequence that -// case-insensitively signals the end of the special tag body. -var specialTagEndMarkers = [...]string{ - elementScript: "= 0 { - c.urlPart = urlPartQueryOrFrag - } else if len(s) != 0 && c.urlPart == urlPartNone { - c.urlPart = urlPartPreQuery - } - return c, nil -} - -// tJS is the context transition function for the JS state. -func tJS(c context, s []byte) (context, []byte) { - if d, t := tSpecialTagEnd(c, s); t != nil { - return d, t - } - - i := bytes.IndexAny(s, `"'/`) - if i == -1 { - // Entire input is non string, comment, regexp tokens. - c.jsCtx = nextJSCtx(s, c.jsCtx) - return c, nil - } - c.jsCtx = nextJSCtx(s[:i], c.jsCtx) - switch s[i] { - case '"': - c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp - case '\'': - c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp - case '/': - switch { - case i+1 < len(s) && s[i+1] == '/': - c.state, i = stateJSLineCmt, i+1 - case i+1 < len(s) && s[i+1] == '*': - c.state, i = stateJSBlockCmt, i+1 - case c.jsCtx == jsCtxRegexp: - c.state = stateJSRegexp - case c.jsCtx == jsCtxDivOp: - c.jsCtx = jsCtxRegexp - default: - return context{ - state: stateError, - errStr: fmt.Sprintf("'/' could start div or regexp: %.32q", s[i:]), - }, nil - } - default: - panic("unreachable") - } - return c, s[i+1:] -} - -// tJSStr is the context transition function for the JS string states. -func tJSStr(c context, s []byte) (context, []byte) { - if d, t := tSpecialTagEnd(c, s); t != nil { - return d, t - } - - quoteAndEsc := `\"` - if c.state == stateJSSqStr { - quoteAndEsc = `\'` - } - - b := s - for { - i := bytes.IndexAny(b, quoteAndEsc) - if i == -1 { - return c, nil - } - if b[i] == '\\' { - i++ - if i == len(b) { - return context{ - state: stateError, - errStr: fmt.Sprintf("unfinished escape sequence in JS string: %q", s), - }, nil - } - } else { - c.state, c.jsCtx = stateJS, jsCtxDivOp - return c, b[i+1:] - } - b = b[i+1:] - } - panic("unreachable") -} - -// tJSRegexp is the context transition function for the /RegExp/ literal state. -func tJSRegexp(c context, s []byte) (context, []byte) { - if d, t := tSpecialTagEnd(c, s); t != nil { - return d, t - } - - b := s - inCharset := false - for { - i := bytes.IndexAny(b, `/[\]`) - if i == -1 { - break - } - switch b[i] { - case '/': - if !inCharset { - c.state, c.jsCtx = stateJS, jsCtxDivOp - return c, b[i+1:] - } - case '\\': - i++ - if i == len(b) { - return context{ - state: stateError, - errStr: fmt.Sprintf("unfinished escape sequence in JS regexp: %q", s), - }, nil - } - case '[': - inCharset = true - case ']': - inCharset = false - default: - panic("unreachable") - } - b = b[i+1:] - } - - if inCharset { - // This can be fixed by making context richer if interpolation - // into charsets is desired. - return context{ - state: stateError, - errStr: fmt.Sprintf("unfinished JS regexp charset: %q", s), - }, nil - } - - return c, nil -} - -var blockCommentEnd = []byte("*/") - -// tBlockCmt is the context transition function for /*comment*/ states. -func tBlockCmt(c context, s []byte) (context, []byte) { - if d, t := tSpecialTagEnd(c, s); t != nil { - return d, t - } - i := bytes.Index(s, blockCommentEnd) - if i == -1 { - return c, nil - } - switch c.state { - case stateJSBlockCmt: - c.state = stateJS - case stateCSSBlockCmt: - c.state = stateCSS - default: - panic(c.state.String()) - } - return c, s[i+2:] -} - -// tLineCmt is the context transition function for //comment states. -func tLineCmt(c context, s []byte) (context, []byte) { - if d, t := tSpecialTagEnd(c, s); t != nil { - return d, t - } - var lineTerminators string - var endState state - switch c.state { - case stateJSLineCmt: - lineTerminators, endState = "\n\r\u2028\u2029", stateJS - case stateCSSLineCmt: - lineTerminators, endState = "\n\f\r", stateCSS - // Line comments are not part of any published CSS standard but - // are supported by the 4 major browsers. - // This defines line comments as - // LINECOMMENT ::= "//" [^\n\f\d]* - // since http://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines - // newlines: - // nl ::= #xA | #xD #xA | #xD | #xC - default: - panic(c.state.String()) - } - - i := bytes.IndexAny(s, lineTerminators) - if i == -1 { - return c, nil - } - c.state = endState - // Per section 7.4 of EcmaScript 5 : http://es5.github.com/#x7.4 - // "However, the LineTerminator at the end of the line is not - // considered to be part of the single-line comment; it is recognised - // separately by the lexical grammar and becomes part of the stream of - // input elements for the syntactic grammar." - return c, s[i:] -} - -// tCSS is the context transition function for the CSS state. -func tCSS(c context, s []byte) (context, []byte) { - if d, t := tSpecialTagEnd(c, s); t != nil { - return d, t - } - - // CSS quoted strings are almost never used except for: - // (1) URLs as in background: "/foo.png" - // (2) Multiword font-names as in font-family: "Times New Roman" - // (3) List separators in content values as in inline-lists: - // - // - // (4) Attribute value selectors as in a[href="http://example.com/"] - // - // We conservatively treat all strings as URLs, but make some - // allowances to avoid confusion. - // - // In (1), our conservative assumption is justified. - // In (2), valid font names do not contain ':', '?', or '#', so our - // conservative assumption is fine since we will never transition past - // urlPartPreQuery. - // In (3), our protocol heuristic should not be tripped, and there - // should not be non-space content after a '?' or '#', so as long as - // we only %-encode RFC 3986 reserved characters we are ok. - // In (4), we should URL escape for URL attributes, and for others we - // have the attribute name available if our conservative assumption - // proves problematic for real code. - - for { - i := bytes.IndexAny(s, `("'/`) - if i == -1 { - return c, nil - } - switch s[i] { - case '(': - // Look for url to the left. - p := bytes.TrimRight(s[:i], "\t\n\f\r ") - if endsWithCSSKeyword(p, "url") { - q := bytes.TrimLeft(s[i+1:], "\t\n\f\r ") - switch { - case len(q) != 0 && q[0] == '"': - c.state, s = stateCSSDqURL, q[1:] - case len(q) != 0 && q[0] == '\'': - c.state, s = stateCSSSqURL, q[1:] - - default: - c.state, s = stateCSSURL, q - } - return c, s - } - case '/': - if i+1 < len(s) { - switch s[i+1] { - case '/': - c.state = stateCSSLineCmt - return c, s[i+2:] - case '*': - c.state = stateCSSBlockCmt - return c, s[i+2:] - } - } - case '"': - c.state = stateCSSDqStr - return c, s[i+1:] - case '\'': - c.state = stateCSSSqStr - return c, s[i+1:] - } - s = s[i+1:] - } - panic("unreachable") -} - -// tCSSStr is the context transition function for the CSS string and URL states. -func tCSSStr(c context, s []byte) (context, []byte) { - if d, t := tSpecialTagEnd(c, s); t != nil { - return d, t - } - - var endAndEsc string - switch c.state { - case stateCSSDqStr, stateCSSDqURL: - endAndEsc = `\"` - case stateCSSSqStr, stateCSSSqURL: - endAndEsc = `\'` - case stateCSSURL: - // Unquoted URLs end with a newline or close parenthesis. - // The below includes the wc (whitespace character) and nl. - endAndEsc = "\\\t\n\f\r )" - default: - panic(c.state.String()) - } - - b := s - for { - i := bytes.IndexAny(b, endAndEsc) - if i == -1 { - return tURL(c, decodeCSS(b)) - } - if b[i] == '\\' { - i++ - if i == len(b) { - return context{ - state: stateError, - errStr: fmt.Sprintf("unfinished escape sequence in CSS string: %q", s), - }, nil - } - } else { - c.state = stateCSS - return c, b[i+1:] - } - c, _ = tURL(c, decodeCSS(b[:i+1])) - b = b[i+1:] - } - panic("unreachable") -} - -// tError is the context transition function for the error state. -func tError(c context, s []byte) (context, []byte) { - return c, nil -} - -// eatAttrName returns the largest j such that s[i:j] is an attribute name. -// It returns an error if s[i:] does not look like it begins with an -// attribute name, such as encountering a quote mark without a preceding -// equals sign. -func eatAttrName(s []byte, i int) (int, os.Error) { - for j := i; j < len(s); j++ { - switch s[j] { - case ' ', '\t', '\n', '\f', '\r', '=', '>': - return j, nil - case '\'', '"', '<': - // These result in a parse warning in HTML5 and are - // indicative of serious problems if seen in an attr - // name in a template. - return 0, fmt.Errorf("%q in attribute name: %.32q", s[j:j+1], s) - default: - // No-op. - } - } - return len(s), nil -} - -var elementNameMap = map[string]element{ - "script": elementScript, - "style": elementStyle, - "textarea": elementTextarea, - "title": elementTitle, -} - -// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type. -func eatTagName(s []byte, i int) (int, element) { - j := i - for ; j < len(s); j++ { - x := s[j] - if !(('a' <= x && x <= 'z') || - ('A' <= x && x <= 'Z') || - ('0' <= x && x <= '9' && i != j)) { - break - } - } - return j, elementNameMap[strings.ToLower(string(s[i:j]))] -} - -// eatWhiteSpace returns the largest j such that s[i:j] is white space. -func eatWhiteSpace(s []byte, i int) int { - for j := i; j < len(s); j++ { - switch s[j] { - case ' ', '\t', '\n', '\f', '\r': - // No-op. - default: - return j - } - } - return len(s) -} - -// urlAttr is the set of attribute names whose values are URLs. -// It consists of all "%URI"-typed attributes from -// http://www.w3.org/TR/html4/index/attributes.html -// as well as those attributes defined at -// http://dev.w3.org/html5/spec/index.html#attributes-1 -// whose Value column in that table matches -// "Valid [non-empty] URL potentially surrounded by spaces". -var urlAttr = map[string]bool{ - "action": true, - "archive": true, - "background": true, - "cite": true, - "classid": true, - "codebase": true, - "data": true, - "formaction": true, - "href": true, - "icon": true, - "longdesc": true, - "manifest": true, - "poster": true, - "profile": true, - "src": true, - "usemap": true, -} diff --git a/src/pkg/exp/template/html/transition.go b/src/pkg/exp/template/html/transition.go new file mode 100644 index 0000000000..117b20a5bf --- /dev/null +++ b/src/pkg/exp/template/html/transition.go @@ -0,0 +1,564 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package html + +import ( + "bytes" + "fmt" + "os" + "strings" +) + +// transitionFunc is the array of context transition functions for text nodes. +// A transition function takes a context and template text input, and returns +// the updated context and any unconsumed text. +var transitionFunc = [...]func(context, []byte) (context, []byte){ + stateText: tText, + stateTag: tTag, + stateComment: tComment, + stateRCDATA: tSpecialTagEnd, + stateAttr: tAttr, + stateURL: tURL, + stateJS: tJS, + stateJSDqStr: tJSStr, + stateJSSqStr: tJSStr, + stateJSRegexp: tJSRegexp, + stateJSBlockCmt: tBlockCmt, + stateJSLineCmt: tLineCmt, + stateCSS: tCSS, + stateCSSDqStr: tCSSStr, + stateCSSSqStr: tCSSStr, + stateCSSDqURL: tCSSStr, + stateCSSSqURL: tCSSStr, + stateCSSURL: tCSSStr, + stateCSSBlockCmt: tBlockCmt, + stateCSSLineCmt: tLineCmt, + stateError: tError, +} + +var commentStart = []byte("") + +// tText is the context transition function for the text state. +func tText(c context, s []byte) (context, []byte) { + for { + i := bytes.IndexByte(s, '<') + if i == -1 || i+1 == len(s) { + return c, nil + } else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) { + return context{state: stateComment}, s[i+4:] + } + i++ + if s[i] == '/' { + if i+1 == len(s) { + return c, nil + } + i++ + } + j, e := eatTagName(s, i) + if j != i { + // We've found an HTML tag. + return context{state: stateTag, element: e}, s[j:] + } + s = s[j:] + } + panic("unreachable") +} + +var elementContentType = [...]state{ + elementNone: stateText, + elementScript: stateJS, + elementStyle: stateCSS, + elementTextarea: stateRCDATA, + elementTitle: stateRCDATA, +} + +// tTag is the context transition function for the tag state. +func tTag(c context, s []byte) (context, []byte) { + // Find the attribute name. + attrStart := eatWhiteSpace(s, 0) + i, err := eatAttrName(s, attrStart) + if err != nil { + return context{ + state: stateError, + errStr: err.String(), + }, nil + } + if i == len(s) { + return c, nil + } + state := stateAttr + canonAttrName := strings.ToLower(string(s[attrStart:i])) + if urlAttr[canonAttrName] { + state = stateURL + } else if strings.HasPrefix(canonAttrName, "on") { + state = stateJS + } else if canonAttrName == "style" { + state = stateCSS + } + + // Look for the start of the value. + i = eatWhiteSpace(s, i) + if i == len(s) { + return c, s[i:] + } + if s[i] == '>' { + state = elementContentType[c.element] + return context{state: state, element: c.element}, s[i+1:] + } else if s[i] != '=' { + // Possible due to a valueless attribute or '/' in "". + return c, s[i:] + } + // Consume the "=". + i = eatWhiteSpace(s, i+1) + + // Find the attribute delimiter. + delim := delimSpaceOrTagEnd + if i < len(s) { + switch s[i] { + case '\'': + delim, i = delimSingleQuote, i+1 + case '"': + delim, i = delimDoubleQuote, i+1 + } + } + + return context{state: state, delim: delim, element: c.element}, s[i:] +} + +// tComment is the context transition function for stateComment. +func tComment(c context, s []byte) (context, []byte) { + i := bytes.Index(s, commentEnd) + if i != -1 { + return context{}, s[i+3:] + } + return c, nil +} + +// specialTagEndMarkers maps element types to the character sequence that +// case-insensitively signals the end of the special tag body. +var specialTagEndMarkers = [...]string{ + elementScript: "= 0 { + c.urlPart = urlPartQueryOrFrag + } else if len(s) != 0 && c.urlPart == urlPartNone { + c.urlPart = urlPartPreQuery + } + return c, nil +} + +// tJS is the context transition function for the JS state. +func tJS(c context, s []byte) (context, []byte) { + if d, t := tSpecialTagEnd(c, s); t != nil { + return d, t + } + + i := bytes.IndexAny(s, `"'/`) + if i == -1 { + // Entire input is non string, comment, regexp tokens. + c.jsCtx = nextJSCtx(s, c.jsCtx) + return c, nil + } + c.jsCtx = nextJSCtx(s[:i], c.jsCtx) + switch s[i] { + case '"': + c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp + case '\'': + c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp + case '/': + switch { + case i+1 < len(s) && s[i+1] == '/': + c.state, i = stateJSLineCmt, i+1 + case i+1 < len(s) && s[i+1] == '*': + c.state, i = stateJSBlockCmt, i+1 + case c.jsCtx == jsCtxRegexp: + c.state = stateJSRegexp + case c.jsCtx == jsCtxDivOp: + c.jsCtx = jsCtxRegexp + default: + return context{ + state: stateError, + errStr: fmt.Sprintf("'/' could start div or regexp: %.32q", s[i:]), + }, nil + } + default: + panic("unreachable") + } + return c, s[i+1:] +} + +// tJSStr is the context transition function for the JS string states. +func tJSStr(c context, s []byte) (context, []byte) { + if d, t := tSpecialTagEnd(c, s); t != nil { + return d, t + } + + quoteAndEsc := `\"` + if c.state == stateJSSqStr { + quoteAndEsc = `\'` + } + + b := s + for { + i := bytes.IndexAny(b, quoteAndEsc) + if i == -1 { + return c, nil + } + if b[i] == '\\' { + i++ + if i == len(b) { + return context{ + state: stateError, + errStr: fmt.Sprintf("unfinished escape sequence in JS string: %q", s), + }, nil + } + } else { + c.state, c.jsCtx = stateJS, jsCtxDivOp + return c, b[i+1:] + } + b = b[i+1:] + } + panic("unreachable") +} + +// tJSRegexp is the context transition function for the /RegExp/ literal state. +func tJSRegexp(c context, s []byte) (context, []byte) { + if d, t := tSpecialTagEnd(c, s); t != nil { + return d, t + } + + b := s + inCharset := false + for { + i := bytes.IndexAny(b, `/[\]`) + if i == -1 { + break + } + switch b[i] { + case '/': + if !inCharset { + c.state, c.jsCtx = stateJS, jsCtxDivOp + return c, b[i+1:] + } + case '\\': + i++ + if i == len(b) { + return context{ + state: stateError, + errStr: fmt.Sprintf("unfinished escape sequence in JS regexp: %q", s), + }, nil + } + case '[': + inCharset = true + case ']': + inCharset = false + default: + panic("unreachable") + } + b = b[i+1:] + } + + if inCharset { + // This can be fixed by making context richer if interpolation + // into charsets is desired. + return context{ + state: stateError, + errStr: fmt.Sprintf("unfinished JS regexp charset: %q", s), + }, nil + } + + return c, nil +} + +var blockCommentEnd = []byte("*/") + +// tBlockCmt is the context transition function for /*comment*/ states. +func tBlockCmt(c context, s []byte) (context, []byte) { + if d, t := tSpecialTagEnd(c, s); t != nil { + return d, t + } + i := bytes.Index(s, blockCommentEnd) + if i == -1 { + return c, nil + } + switch c.state { + case stateJSBlockCmt: + c.state = stateJS + case stateCSSBlockCmt: + c.state = stateCSS + default: + panic(c.state.String()) + } + return c, s[i+2:] +} + +// tLineCmt is the context transition function for //comment states. +func tLineCmt(c context, s []byte) (context, []byte) { + if d, t := tSpecialTagEnd(c, s); t != nil { + return d, t + } + var lineTerminators string + var endState state + switch c.state { + case stateJSLineCmt: + lineTerminators, endState = "\n\r\u2028\u2029", stateJS + case stateCSSLineCmt: + lineTerminators, endState = "\n\f\r", stateCSS + // Line comments are not part of any published CSS standard but + // are supported by the 4 major browsers. + // This defines line comments as + // LINECOMMENT ::= "//" [^\n\f\d]* + // since http://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines + // newlines: + // nl ::= #xA | #xD #xA | #xD | #xC + default: + panic(c.state.String()) + } + + i := bytes.IndexAny(s, lineTerminators) + if i == -1 { + return c, nil + } + c.state = endState + // Per section 7.4 of EcmaScript 5 : http://es5.github.com/#x7.4 + // "However, the LineTerminator at the end of the line is not + // considered to be part of the single-line comment; it is recognised + // separately by the lexical grammar and becomes part of the stream of + // input elements for the syntactic grammar." + return c, s[i:] +} + +// tCSS is the context transition function for the CSS state. +func tCSS(c context, s []byte) (context, []byte) { + if d, t := tSpecialTagEnd(c, s); t != nil { + return d, t + } + + // CSS quoted strings are almost never used except for: + // (1) URLs as in background: "/foo.png" + // (2) Multiword font-names as in font-family: "Times New Roman" + // (3) List separators in content values as in inline-lists: + // + // + // (4) Attribute value selectors as in a[href="http://example.com/"] + // + // We conservatively treat all strings as URLs, but make some + // allowances to avoid confusion. + // + // In (1), our conservative assumption is justified. + // In (2), valid font names do not contain ':', '?', or '#', so our + // conservative assumption is fine since we will never transition past + // urlPartPreQuery. + // In (3), our protocol heuristic should not be tripped, and there + // should not be non-space content after a '?' or '#', so as long as + // we only %-encode RFC 3986 reserved characters we are ok. + // In (4), we should URL escape for URL attributes, and for others we + // have the attribute name available if our conservative assumption + // proves problematic for real code. + + for { + i := bytes.IndexAny(s, `("'/`) + if i == -1 { + return c, nil + } + switch s[i] { + case '(': + // Look for url to the left. + p := bytes.TrimRight(s[:i], "\t\n\f\r ") + if endsWithCSSKeyword(p, "url") { + q := bytes.TrimLeft(s[i+1:], "\t\n\f\r ") + switch { + case len(q) != 0 && q[0] == '"': + c.state, s = stateCSSDqURL, q[1:] + case len(q) != 0 && q[0] == '\'': + c.state, s = stateCSSSqURL, q[1:] + + default: + c.state, s = stateCSSURL, q + } + return c, s + } + case '/': + if i+1 < len(s) { + switch s[i+1] { + case '/': + c.state = stateCSSLineCmt + return c, s[i+2:] + case '*': + c.state = stateCSSBlockCmt + return c, s[i+2:] + } + } + case '"': + c.state = stateCSSDqStr + return c, s[i+1:] + case '\'': + c.state = stateCSSSqStr + return c, s[i+1:] + } + s = s[i+1:] + } + panic("unreachable") +} + +// tCSSStr is the context transition function for the CSS string and URL states. +func tCSSStr(c context, s []byte) (context, []byte) { + if d, t := tSpecialTagEnd(c, s); t != nil { + return d, t + } + + var endAndEsc string + switch c.state { + case stateCSSDqStr, stateCSSDqURL: + endAndEsc = `\"` + case stateCSSSqStr, stateCSSSqURL: + endAndEsc = `\'` + case stateCSSURL: + // Unquoted URLs end with a newline or close parenthesis. + // The below includes the wc (whitespace character) and nl. + endAndEsc = "\\\t\n\f\r )" + default: + panic(c.state.String()) + } + + b := s + for { + i := bytes.IndexAny(b, endAndEsc) + if i == -1 { + return tURL(c, decodeCSS(b)) + } + if b[i] == '\\' { + i++ + if i == len(b) { + return context{ + state: stateError, + errStr: fmt.Sprintf("unfinished escape sequence in CSS string: %q", s), + }, nil + } + } else { + c.state = stateCSS + return c, b[i+1:] + } + c, _ = tURL(c, decodeCSS(b[:i+1])) + b = b[i+1:] + } + panic("unreachable") +} + +// tError is the context transition function for the error state. +func tError(c context, s []byte) (context, []byte) { + return c, nil +} + +// eatAttrName returns the largest j such that s[i:j] is an attribute name. +// It returns an error if s[i:] does not look like it begins with an +// attribute name, such as encountering a quote mark without a preceding +// equals sign. +func eatAttrName(s []byte, i int) (int, os.Error) { + for j := i; j < len(s); j++ { + switch s[j] { + case ' ', '\t', '\n', '\f', '\r', '=', '>': + return j, nil + case '\'', '"', '<': + // These result in a parse warning in HTML5 and are + // indicative of serious problems if seen in an attr + // name in a template. + return 0, fmt.Errorf("%q in attribute name: %.32q", s[j:j+1], s) + default: + // No-op. + } + } + return len(s), nil +} + +var elementNameMap = map[string]element{ + "script": elementScript, + "style": elementStyle, + "textarea": elementTextarea, + "title": elementTitle, +} + +// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type. +func eatTagName(s []byte, i int) (int, element) { + j := i + for ; j < len(s); j++ { + x := s[j] + if !(('a' <= x && x <= 'z') || + ('A' <= x && x <= 'Z') || + ('0' <= x && x <= '9' && i != j)) { + break + } + } + return j, elementNameMap[strings.ToLower(string(s[i:j]))] +} + +// eatWhiteSpace returns the largest j such that s[i:j] is white space. +func eatWhiteSpace(s []byte, i int) int { + for j := i; j < len(s); j++ { + switch s[j] { + case ' ', '\t', '\n', '\f', '\r': + // No-op. + default: + return j + } + } + return len(s) +} + +// urlAttr is the set of attribute names whose values are URLs. +// It consists of all "%URI"-typed attributes from +// http://www.w3.org/TR/html4/index/attributes.html +// as well as those attributes defined at +// http://dev.w3.org/html5/spec/index.html#attributes-1 +// whose Value column in that table matches +// "Valid [non-empty] URL potentially surrounded by spaces". +var urlAttr = map[string]bool{ + "action": true, + "archive": true, + "background": true, + "cite": true, + "classid": true, + "codebase": true, + "data": true, + "formaction": true, + "href": true, + "icon": true, + "longdesc": true, + "manifest": true, + "poster": true, + "profile": true, + "src": true, + "usemap": true, +}