Augments type context and adds grammatical rules to handle special HTML constructs:
<!-- comments -->
<script>raw text</script>
<textarea>no tags here</textarea>
This CL does not elide comment content. I recommend we do that but
have not done it in this CL.
I used a codesearch tool over a codebase in another template language.
Based on the below I think we should definitely recognize
<script>, <style>, <textarea>, and <title>
as each of these appears frequently enough that there are few
template using apps that do not use most of them.
Of the other special tags,
<xmp>, <noscript>
are used but infrequently, and
<noframe> and friend, <listing>
do not appear at all.
We could support <xmp> even though it is obsolete in HTML5
because we already have the machinery, but I suggest we do not
support noscript since it is a normal tag in some browser
configurations.
I suggest recognizing and eliding <!-- comments -->
(but not escaping text spans) as they are widely used to
embed comments in template source. Not eliding them increases
the size of content sent over the network, and risks leaking
code and project internal details.
The template language I tested elides them so there are
no instance of IE conditional compilation directives in the
codebase but that could be a source of confusion.
The codesearch does the equivalent of
$ find . -name \*.file-extension \
| perl -ne 'print "\L$1\n" while s@<([a-z][a-z0-9])@@i' \
| sort | uniq -c | sort
The 5 uses of <plaintext> seem to be in tricky code and can be ignored.
The 2 uses of <xmp> appear in the same tricky code and can be ignored.
I also ignored end tags to avoid biasing against unary
elements and threw out some nonsense names since since the
long tail is dominated by uses of < as a comparison operator
in the template languages expression language.
I have added asterisks next to abnormal elements.
26765 div
7432 span
7414 td
4233 a
3730 tr
3238 input
2102 br
1756 li
1755 img
1674 table
1388 p
1311 th
1064 option
992 b
891 label
714 script *
519 ul
446 tbody
412 button
381 form
377 h2
358 select
353 strong
318 h3
314 body
303 html
266 link
262 textarea *
261 head
258 meta
225 title *
189 h1
176 col
156 style *
151 hr
119 iframe
103 h4
101 pre
100 dt
98 thead
90 dd
83 map
80 i
69 object
66 ol
65 em
60 param
60 font
57 fieldset
51 string
51 field
51 center
44 bidi
37 kbd
35 legend
30 nobr
29 dl
28 var
26 small
21 cite
21 base
20 embed
19 colgroup
12 u
12 canvas
10 sup
10 rect
10 optgroup
10 noscript *
9 wbr
9 blockquote
8 tfoot
8 code
8 caption
8 abbr
7 msg
6 tt
6 text
6 h5
5 svg
5 plaintext *
5 article
4 shortquote
4 number
4 menu
4 ins
3 progress
3 header
3 content
3 bool
3 audio
3 attribute
3 acronym
2 xmp *
2 overwrite
2 objects
2 nobreak
2 metadata
2 description
2 datasource
2 category
2 action
R=nigeltao
CC=golang-dev
https://golang.org/cl/
4964045
delim delim
urlPart urlPart
jsCtx jsCtx
+ element element
errLine int
errStr string
}
// eq returns whether two contexts are equal.
func (c context) eq(d context) bool {
- return c.state == d.state && c.delim == d.delim && c.urlPart == d.urlPart && c.jsCtx == d.jsCtx && c.errLine == d.errLine && c.errStr == d.errStr
+ return c.state == d.state &&
+ c.delim == d.delim &&
+ c.urlPart == d.urlPart &&
+ c.jsCtx == d.jsCtx &&
+ c.element == d.element &&
+ c.errLine == d.errLine &&
+ c.errStr == d.errStr
}
// state describes a high-level HTML parser state.
stateText state = iota
// stateTag occurs before an HTML attribute or the end of a tag.
stateTag
+ // stateComment occurs inside an <!-- HTML comment -->.
+ stateComment
+ // stateRCDATA occurs inside an RCDATA element (<textarea> or <title>)
+ // as described at http://dev.w3.org/html5/spec/syntax.html#elements-0
+ stateRCDATA
// stateAttr occurs inside an HTML attribute whose content is text.
stateAttr
// stateURL occurs inside an HTML attribute whose content is a URL.
var stateNames = [...]string{
stateText: "stateText",
stateTag: "stateTag",
+ stateComment: "stateComment",
+ stateRCDATA: "stateRCDATA",
stateAttr: "stateAttr",
stateURL: "stateURL",
stateJS: "stateJS",
}
return fmt.Sprintf("illegal jsCtx %d", c)
}
+
+// element identifies the HTML element when inside a start tag or special body.
+// Certain HTML element (for example <script> and <style>) have bodies that are
+// treated differently from stateText so the element type is necessary to
+// transition into the correct context at the end of a tag and to identify the
+// end delimiter for the body.
+type element uint8
+
+const (
+ // elementNone occurs outside a special tag or special element body.
+ elementNone element = iota
+ // elementScript corresponds to the raw text <script> element.
+ elementScript
+ // elementStyle corresponds to the raw text <style> element.
+ elementStyle
+ // elementTextarea corresponds to the RCDATA <textarea> element.
+ elementTextarea
+ // elementTitle corresponds to the RCDATA <title> element.
+ elementTitle
+)
+
+var elementNames = [...]string{
+ elementNone: "elementNone",
+ elementScript: "elementScript",
+ elementStyle: "elementStyle",
+ elementTextarea: "elementTextarea",
+ elementTitle: "elementTitle",
+}
+
+func (e element) String() string {
+ if int(e) < len(elementNames) {
+ return elementNames[e]
+ }
+ return fmt.Sprintf("illegal element %d", e)
+}
s = append(s, "exp_template_html_jsstrescaper")
case stateJSRegexp:
s = append(s, "exp_template_html_jsregexpescaper")
- case stateJSBlockCmt, stateJSLineCmt, stateCSSBlockCmt, stateCSSLineCmt:
+ case stateComment, stateJSBlockCmt, stateJSLineCmt, stateCSSBlockCmt, stateCSSLineCmt:
return context{
state: stateError,
errLine: n.Line,
// Consume any quote.
i++
}
- c, s = context{state: stateTag}, s[i:]
+ // On exiting an attribute, we discard all state information
+ // except the state and element.
+ c, s = context{state: stateTag, element: c.element}, s[i:]
}
return c
}
var transitionFunc = [...]func(context, []byte) (context, []byte){
stateText: tText,
stateTag: tTag,
+ stateComment: tComment,
+ stateRCDATA: tSpecialTagEnd,
stateAttr: tAttr,
stateURL: tURL,
stateJS: tJS,
stateError: tError,
}
+var commentStart = []byte("<!--")
+var commentEnd = []byte("-->")
+
// tText is the context transition function for the text state.
func tText(c context, s []byte) (context, []byte) {
for {
i := bytes.IndexByte(s, '<')
if i == -1 || i+1 == len(s) {
return c, nil
+ } else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
+ return context{state: stateComment}, s[i+4:]
}
i++
if s[i] == '/' {
}
i++
}
- j := eatTagName(s, i)
+ j, e := eatTagName(s, i)
if j != i {
// We've found an HTML tag.
- return context{state: stateTag}, s[j:]
+ return context{state: stateTag, element: e}, s[j:]
}
s = s[j:]
}
panic("unreachable")
}
+var elementContentType = [...]state{
+ elementNone: stateText,
+ elementScript: stateJS,
+ elementStyle: stateCSS,
+ elementTextarea: stateRCDATA,
+ elementTitle: stateRCDATA,
+}
+
// tTag is the context transition function for the tag state.
func tTag(c context, s []byte) (context, []byte) {
// Find the attribute name.
}, nil
}
if i == len(s) {
- return context{state: stateTag}, nil
+ return c, nil
}
state := stateAttr
canonAttrName := strings.ToLower(string(s[attrStart:i]))
// Look for the start of the value.
i = eatWhiteSpace(s, i)
if i == len(s) {
- return context{state: stateTag}, s[i:]
+ return c, s[i:]
}
if s[i] == '>' {
- return context{state: stateText}, s[i+1:]
+ state = elementContentType[c.element]
+ return context{state: state, element: c.element}, s[i+1:]
} else if s[i] != '=' {
// Possible due to a valueless attribute or '/' in "<input />".
- return context{state: stateTag}, s[i:]
+ return c, s[i:]
}
// Consume the "=".
i = eatWhiteSpace(s, i+1)
}
}
- return context{state: state, delim: delim}, s[i:]
+ return context{state: state, delim: delim, element: c.element}, s[i:]
+}
+
+// tComment is the context transition function for stateComment.
+func tComment(c context, s []byte) (context, []byte) {
+ i := bytes.Index(s, commentEnd)
+ if i != -1 {
+ return context{}, s[i+3:]
+ }
+ return c, nil
+}
+
+// specialTagEndMarkers maps element types to the character sequence that
+// case-insensitively signals the end of the special tag body.
+var specialTagEndMarkers = [...]string{
+ elementScript: "</script",
+ elementStyle: "</style",
+ elementTextarea: "</textarea",
+ elementTitle: "</title",
+}
+
+// tSpecialTagEnd is the context transition function for raw text and RCDATA
+// element states.
+func tSpecialTagEnd(c context, s []byte) (context, []byte) {
+ if c.element != elementNone {
+ end := specialTagEndMarkers[c.element]
+ i := strings.Index(strings.ToLower(string(s)), end)
+ if i != -1 {
+ return context{state: stateTag}, s[i+len(end):]
+ }
+ }
+ return c, nil
}
// tAttr is the context transition function for the attribute state.
// tJS is the context transition function for the JS state.
func tJS(c context, s []byte) (context, []byte) {
- // TODO: delegate to tSpecialTagEnd to find any </script> once that CL
- // has been merged.
+ if d, t := tSpecialTagEnd(c, s); t != nil {
+ return d, t
+ }
i := bytes.IndexAny(s, `"'/`)
if i == -1 {
// tJSStr is the context transition function for the JS string states.
func tJSStr(c context, s []byte) (context, []byte) {
- // TODO: delegate to tSpecialTagEnd to find any </script> once that CL
- // has been merged.
+ if d, t := tSpecialTagEnd(c, s); t != nil {
+ return d, t
+ }
quoteAndEsc := `\"`
if c.state == stateJSSqStr {
// tJSRegexp is the context transition function for the /RegExp/ literal state.
func tJSRegexp(c context, s []byte) (context, []byte) {
- // TODO: delegate to tSpecialTagEnd to find any </script> once that CL
- // has been merged.
+ if d, t := tSpecialTagEnd(c, s); t != nil {
+ return d, t
+ }
b := s
inCharset := false
// tBlockCmt is the context transition function for /*comment*/ states.
func tBlockCmt(c context, s []byte) (context, []byte) {
- // TODO: look for </script or </style end tags.
+ if d, t := tSpecialTagEnd(c, s); t != nil {
+ return d, t
+ }
i := bytes.Index(s, blockCommentEnd)
if i == -1 {
return c, nil
// tLineCmt is the context transition function for //comment states.
func tLineCmt(c context, s []byte) (context, []byte) {
- // TODO: look for </script or </style end tags.
+ if d, t := tSpecialTagEnd(c, s); t != nil {
+ return d, t
+ }
var lineTerminators string
var endState state
switch c.state {
// tCSS is the context transition function for the CSS state.
func tCSS(c context, s []byte) (context, []byte) {
- // TODO: look for </style
+ if d, t := tSpecialTagEnd(c, s); t != nil {
+ return d, t
+ }
// CSS quoted strings are almost never used except for:
// (1) URLs as in background: "/foo.png"
// tCSSStr is the context transition function for the CSS string and URL states.
func tCSSStr(c context, s []byte) (context, []byte) {
- // TODO: look for </style
+ if d, t := tSpecialTagEnd(c, s); t != nil {
+ return d, t
+ }
var endAndEsc string
switch c.state {
return len(s), nil
}
-// eatTagName returns the largest j such that s[i:j] is a tag name.
-func eatTagName(s []byte, i int) int {
- for j := i; j < len(s); j++ {
+var elementNameMap = map[string]element{
+ "script": elementScript,
+ "style": elementStyle,
+ "textarea": elementTextarea,
+ "title": elementTitle,
+}
+
+// eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
+func eatTagName(s []byte, i int) (int, element) {
+ j := i
+ for ; j < len(s); j++ {
x := s[j]
- switch {
- case 'a' <= x && x <= 'z':
- // No-op.
- case 'A' <= x && x <= 'Z':
- // No-op.
- case '0' <= x && x <= '9' && i != j:
- // No-op.
- default:
- return j
+ if !(('a' <= x && x <= 'z') ||
+ ('A' <= x && x <= 'Z') ||
+ ('0' <= x && x <= '9' && i != j)) {
+ break
}
}
- return len(s)
+ return j, elementNameMap[strings.ToLower(string(s[i:j]))]
}
// eatWhiteSpace returns the largest j such that s[i:j] is white space.
"<button onclick='alert({{.A}})'>",
`<button onclick='alert(["\u003ca\u003e","\u003cb\u003e"])'>`,
},
+ {
+ "jsObjValueScript",
+ "<script>alert({{.A}})</script>",
+ `<script>alert(["\u003ca\u003e","\u003cb\u003e"])</script>`,
+ },
{
"jsObjValueNotOverEscaped",
"<button onclick='alert({{.A | html}})'>",
},
{
"styleIDPassed",
- `<style>p{{"#my-ID"}} { font: Arial }`,
- `<style>p#my-ID { font: Arial }`,
+ `<style>p{{"#my-ID"}} { font: Arial }</style>`,
+ `<style>p#my-ID { font: Arial }</style>`,
},
{
"styleClassPassed",
- `<style>p{{".my_class"}} { font: Arial }`,
- `<style>p.my_class { font: Arial }`,
+ `<style>p{{".my_class"}} { font: Arial }</style>`,
+ `<style>p.my_class { font: Arial }</style>`,
},
{
"styleQuantityPassed",
`<a style="background: '{{"http://oreilly.com/O'Reilly Animals(1)<2>;{}.html"}}'">`,
`<a style="background: 'http\3a\2f\2foreilly.com\2fO\27Reilly Animals\28 1\29\3c 2\3e\3b\7b\7d.html'">`,
},
+ {
+ "styleURLEncodedForHTMLInAttr",
+ `<a style="background: url('{{"/search?img=foo&size=icon"}}')">`,
+ `<a style="background: url('/search?img=foo&size=icon')">`,
+ },
+ {
+ "styleURLNotEncodedForHTMLInCdata",
+ `<style>body { background: url('{{"/search?img=foo&size=icon"}}') }</style>`,
+ `<style>body { background: url('/search?img=foo&size=icon') }</style>`,
+ },
{
"styleURLMixedCase",
`<p style="background: URL(#{{.H}})">`,
`<a style="border-image: url({{"/**/'\";:// \\"}}), url("{{"/**/'\";:// \\"}}"), url('{{"/**/'\";:// \\"}}'), 'http://www.example.com/?q={{"/**/'\";:// \\"}}''">`,
`<a style="border-image: url(/**/%27%22;://%20%5c), url("/**/%27%22;://%20%5c"), url('/**/%27%22;://%20%5c'), 'http://www.example.com/?q=%2f%2a%2a%2f%27%22%3b%3a%2f%2f%20%5c''">`,
},
+ {
+ "comment",
+ "<b>Hello, <!-- name of world -->{{.C}}</b>",
+ // TODO: Elide comment.
+ "<b>Hello, <!-- name of world --><Cincinatti></b>",
+ },
}
for _, test := range tests {
tmpl := template.Must(template.New(test.name).Parse(test.input))
- tmpl, err := Escape(tmpl)
+ tmpl = template.Must(Escape(tmpl))
b := new(bytes.Buffer)
- if err = tmpl.Execute(b, data); err != nil {
+ if err := tmpl.Execute(b, data); err != nil {
t.Errorf("%s: template execution failed: %s", test.name, err)
continue
}
"<a b=1 c={{.H}}",
"z ends in a non-text context: {stateAttr delimSpaceOrTagEnd",
},
+ {
+ "<script>foo();",
+ "z ends in a non-text context: {stateJS",
+ },
{
`<a href="{{if .F}}/foo?a={{else}}/bar/{{end}}{{.H}}">`,
"z:1: (action: [(command: [F=[H]])]) appears in an ambiguous URL context",
`<a style="// color: {{.X}}">`,
`z:1: (action: [(command: [F=[X]])]) appears inside a comment`,
},
+ {
+ "<!-- {{.H}} -->",
+ "z:1: (action: [(command: [F=[H]])]) appears inside a comment",
+ },
}
for _, test := range tests {
`<a style="background: url( x `,
context{state: stateCSS, delim: delimDoubleQuote},
},
+ {
+ `<!-- foo`,
+ context{state: stateComment},
+ },
+ {
+ `<!-->`,
+ context{state: stateComment},
+ },
+ {
+ `<!--->`,
+ context{state: stateComment},
+ },
+ {
+ `<!-- foo -->`,
+ context{state: stateText},
+ },
+ {
+ `<script`,
+ context{state: stateTag, element: elementScript},
+ },
+ {
+ `<script `,
+ context{state: stateTag, element: elementScript},
+ },
+ {
+ `<script src="foo.js" `,
+ context{state: stateTag, element: elementScript},
+ },
+ {
+ `<script src='foo.js' `,
+ context{state: stateTag, element: elementScript},
+ },
+ {
+ `<script type=text/javascript `,
+ context{state: stateTag, element: elementScript},
+ },
+ {
+ `<script>foo`,
+ context{state: stateJS, jsCtx: jsCtxDivOp, element: elementScript},
+ },
+ {
+ `<script>foo</script>`,
+ context{state: stateText},
+ },
+ {
+ `<script>foo</script><!--`,
+ context{state: stateComment},
+ },
+ {
+ `<script>document.write("<p>foo</p>");`,
+ context{state: stateJS, element: elementScript},
+ },
+ {
+ `<script>document.write("<p>foo<\/script>");`,
+ context{state: stateJS, element: elementScript},
+ },
+ {
+ `<script>document.write("<script>alert(1)</script>");`,
+ context{state: stateText},
+ },
+ {
+ `<Script>`,
+ context{state: stateJS, element: elementScript},
+ },
+ {
+ `<SCRIPT>foo`,
+ context{state: stateJS, jsCtx: jsCtxDivOp, element: elementScript},
+ },
+ {
+ `<textarea>value`,
+ context{state: stateRCDATA, element: elementTextarea},
+ },
+ {
+ `<textarea>value</TEXTAREA>`,
+ context{state: stateText},
+ },
+ {
+ `<textarea name=html><b`,
+ context{state: stateRCDATA, element: elementTextarea},
+ },
+ {
+ `<title>value`,
+ context{state: stateRCDATA, element: elementTitle},
+ },
+ {
+ `<style>value`,
+ context{state: stateCSS, element: elementStyle},
+ },
}
for _, test := range tests {