// For example, `<b>¡Hi!</b> <script>...</script>` -> `¡Hi! `.
func stripTags(html string) string {
var b bytes.Buffer
- s, c, i := []byte(html), context{}, 0
+ s, c, i, allText := []byte(html), context{}, 0, true
// Using the transition funcs helps us avoid mangling
// `<div title="1>2">` or `I <3 Ponies!`.
for i != len(s) {
if c.delim == delimNone {
- d, nread := transitionFunc[c.state](c, s[i:])
+ st := c.state
+ // Inside a special element's body, scan as RCDATA rather than parsing into the JS or CSS states.
+ if c.element != elementNone && !isInTag(st) {
+ st = stateRCDATA
+ }
+ d, nread := transitionFunc[st](c, s[i:])
i1 := i + nread
if c.state == stateText || c.state == stateRCDATA {
// Emit text up to the start of the tag or comment.
}
}
b.Write(s[i:j])
+ } else {
+ allText = false
}
c, i = d, i1
continue
}
c, i = context{state: stateTag, element: c.element}, i1
}
- if c.state == stateText {
- if b.Len() == 0 {
- return html
- }
+ if allText {
+ return html
+ } else if c.state == stateText || c.state == stateRCDATA {
b.Write(s[i:])
}
return b.String()
stateAttr: tAttr,
stateURL: tURL,
stateJS: tJS,
- stateJSDqStr: tJSStr,
- stateJSSqStr: tJSStr,
- stateJSRegexp: tJSRegexp,
+ stateJSDqStr: tJSDelimited,
+ stateJSSqStr: tJSDelimited,
+ stateJSRegexp: tJSDelimited,
stateJSBlockCmt: tBlockCmt,
stateJSLineCmt: tLineCmt,
stateCSS: tCSS,
return context{state: stateHTMLCmt}, i + 4
}
i++
+ end := false
if s[i] == '/' {
if i+1 == len(s) {
return c, len(s)
}
- i++
+ end, i = true, i+1
}
j, e := eatTagName(s, i)
if j != i {
+ if end {
+ e = elementNone
+ }
// We've found an HTML tag.
return context{state: stateTag, element: e}, j
}
i, err := eatAttrName(s, 0)
if err != nil {
return context{state: stateError, err: err}, len(s)
- } else if i == len(s) {
- return c, len(s)
+ } else if i != len(s) {
+ c.state = stateAfterName
}
- c.state = stateAfterName
return c, i
}
// tHTMLCmt is the context transition function for stateHTMLCmt.
func tHTMLCmt(c context, s []byte) (context, int) {
- i := bytes.Index(s, commentEnd)
- if i != -1 {
+ if i := bytes.Index(s, commentEnd); i != -1 {
return context{}, i + 3
}
return c, len(s)
// element states.
func tSpecialTagEnd(c context, s []byte) (context, int) {
if c.element != elementNone {
- end := specialTagEndMarkers[c.element]
- i := strings.Index(strings.ToLower(string(s)), end)
- if i != -1 {
- return context{state: stateTag}, i + len(end)
+ if i := strings.Index(strings.ToLower(string(s)), specialTagEndMarkers[c.element]); i != -1 {
+ return context{}, i
}
}
return c, len(s)
// tJS is the context transition function for the JS state.
func tJS(c context, s []byte) (context, int) {
- if d, i := tSpecialTagEnd(c, s); i != len(s) {
- return d, i
- }
-
i := bytes.IndexAny(s, `"'/`)
if i == -1 {
// Entire input is non string, comment, regexp tokens.
return c, i + 1
}
-// tJSStr is the context transition function for the JS string states.
-func tJSStr(c context, s []byte) (context, int) {
- if d, i := tSpecialTagEnd(c, s); i != len(s) {
- return d, i
- }
-
- quoteAndEsc := `\"`
- if c.state == stateJSSqStr {
- quoteAndEsc = `\'`
- }
-
- k := 0
- for {
- i := k + bytes.IndexAny(s[k:], quoteAndEsc)
- if i < k {
- return c, len(s)
- }
- if s[i] == '\\' {
- i++
- if i == len(s) {
- return context{
- state: stateError,
- err: errorf(ErrPartialEscape, 0, "unfinished escape sequence in JS string: %q", s),
- }, len(s)
- }
- } else {
- c.state, c.jsCtx = stateJS, jsCtxDivOp
- return c, i + 1
- }
- k = i + 1
- }
- panic("unreachable")
-}
-
-// tJSRegexp is the context transition function for the /RegExp/ literal state.
-func tJSRegexp(c context, s []byte) (context, int) {
- if d, i := tSpecialTagEnd(c, s); i != len(s) {
- return d, i
+// tJSDelimited is the context transition function for the JS string and regexp
+// states.
+func tJSDelimited(c context, s []byte) (context, int) {
+ specials := `\"`
+ switch c.state {
+ case stateJSSqStr:
+ specials = `\'`
+ case stateJSRegexp:
+ specials = `\/[]`
}
k, inCharset := 0, false
for {
- i := k + bytes.IndexAny(s[k:], `\/[]`)
+ i := k + bytes.IndexAny(s[k:], specials)
if i < k {
break
}
switch s[i] {
- case '/':
- if !inCharset {
- c.state, c.jsCtx = stateJS, jsCtxDivOp
- return c, i + 1
- }
case '\\':
i++
if i == len(s) {
return context{
state: stateError,
- err: errorf(ErrPartialEscape, 0, "unfinished escape sequence in JS regexp: %q", s),
+ err: errorf(ErrPartialEscape, 0, "unfinished escape sequence in JS string: %q", s),
}, len(s)
}
case '[':
case ']':
inCharset = false
default:
- panic("unreachable")
+ // Closing delimiter (quote or '/'); it ends the literal only when not inside a regexp [charset].
+ if !inCharset {
+ c.state, c.jsCtx = stateJS, jsCtxDivOp
+ return c, i + 1
+ }
}
k = i + 1
}
// tBlockCmt is the context transition function for /*comment*/ states.
func tBlockCmt(c context, s []byte) (context, int) {
- if d, i := tSpecialTagEnd(c, s); i != len(s) {
- return d, i
- }
i := bytes.Index(s, blockCommentEnd)
if i == -1 {
return c, len(s)
// tLineCmt is the context transition function for //comment states.
func tLineCmt(c context, s []byte) (context, int) {
- if d, i := tSpecialTagEnd(c, s); i != len(s) {
- return d, i
- }
var lineTerminators string
var endState state
switch c.state {
// tCSS is the context transition function for the CSS state.
func tCSS(c context, s []byte) (context, int) {
- if d, i := tSpecialTagEnd(c, s); i != len(s) {
- return d, i
- }
-
// CSS quoted strings are almost never used except for:
// (1) URLs as in background: "/foo.png"
// (2) Multiword font-names as in font-family: "Times New Roman"
// tCSSStr is the context transition function for the CSS string and URL states.
func tCSSStr(c context, s []byte) (context, int) {
- if d, i := tSpecialTagEnd(c, s); i != len(s) {
- return d, i
- }
-
var endAndEsc string
switch c.state {
case stateCSSDqStr, stateCSSDqURL: