}
if pv, ok := val.(*reflect.PtrValue); ok {
- zv := reflect.MakeZero(pv.Type().(*reflect.PtrType).Elem());
- pv.PointTo(zv);
- val = zv;
+ if pv.Get() == 0 {
+ zv := reflect.MakeZero(pv.Type().(*reflect.PtrType).Elem());
+ pv.PointTo(zv);
+ val = zv;
+ } else {
+ val = pv.Elem();
+ }
}
var (
data []byte;
saveData reflect.Value;
+ comment []byte;
+ saveComment reflect.Value;
sv *reflect.StructValue;
styp *reflect.StructType;
)
}
// Assign attributes.
- // Also, determine whether we need to save character data.
+ // Also, determine whether we need to save character data or comments.
for i, n := 0, typ.NumField(); i < n; i++ {
f := typ.Field(i);
switch f.Tag {
}
strv.Set(val);
+ case "comment":
+ if saveComment == nil {
+ saveComment = sv.FieldByIndex(f.Index);
+ }
+
case "chardata":
if saveData == nil {
saveData = sv.FieldByIndex(f.Index);
if saveData != nil {
data = bytes.Add(data, t);
}
+
+ case Comment:
+ if saveComment != nil {
+ comment = bytes.Add(comment, t);
+ }
}
}
- // Save accumulated character data
- if saveData != nil {
- switch t := saveData.(type) {
- case *reflect.StringValue:
- t.Set(string(data));
- case *reflect.SliceValue:
- t.Set(reflect.NewValue(data).(*reflect.SliceValue));
- }
+ // Save accumulated character data and comments
+ switch t := saveData.(type) {
+ case *reflect.StringValue:
+ t.Set(string(data));
+ case *reflect.SliceValue:
+ t.Set(reflect.NewValue(data).(*reflect.SliceValue));
+ }
+
+ switch t := saveComment.(type) {
+ case *reflect.StringValue:
+ t.Set(string(comment));
+ case *reflect.SliceValue:
+ t.Set(reflect.NewValue(comment).(*reflect.SliceValue));
}
return nil;
// A Parser represents an XML parser reading a particular input stream.
// The parser assumes that its input is encoded in UTF-8.
type Parser struct {
+ // Strict defaults to true, enforcing the requirements
+ // of the XML specification.
+ // If set to false, the parser allows input containing common
+ // mistakes:
+ // * If an element is missing an end tag, the parser invents
+ // end tags as necessary to keep the return values from Token
+ // properly balanced.
+ // * In attribute values and character data, unknown or malformed
+ // character entities (sequences beginning with &) are left alone.
+ //
+ // Setting:
+ //
+ // p.Strict = false;
+ // p.AutoClose = HTMLAutoClose;
+ // p.Entity = HTMLEntity
+ //
+ // creates a parser that can handle typical HTML.
+ Strict bool;
+
+ // When Strict == false, AutoClose indicates a set of elements to
+ // consider closed immediately after they are opened, regardless
+ // of whether an end element is present.
+ AutoClose []string;
+
+ // Entity can be used to map non-standard entity names to string replacements.
+ // The parser behaves as if these standard mappings are present in the map,
+ // regardless of the actual map content:
+ //
+ // "lt": "<",
+ // "gt": ">",
+ // "amp": "&",
+ // "pos": "'",
+ // "quot": `"`,
+ //
+ Entity map[string]string;
+
r readByter;
buf bytes.Buffer;
stk *stack;
free *stack;
needClose bool;
toClose Name;
+ nextToken Token;
nextByte int;
ns map[string]string;
err os.Error;
ns: make(map[string]string),
nextByte: -1,
line: 1,
+ Strict: true,
};
// Get efficient byte at a time reader.
// it uses the prefix as the Space rather than report an error.
//
func (p *Parser) Token() (t Token, err os.Error) {
- if t, err = p.RawToken(); err != nil {
+ if p.nextToken != nil {
+ t = p.nextToken;
+ p.nextToken = nil;
+ } else if t, err = p.RawToken(); err != nil {
return;
}
+
+ if !p.Strict {
+ if t1, ok := p.autoClose(t); ok {
+ p.nextToken = t;
+ t = t1;
+ }
+ }
switch t1 := t.(type) {
case StartElement:
// In XML name spaces, the translations listed in the
case EndElement:
p.translate(&t1.Name, true);
- if !p.popElement(t1.Name) {
+ if !p.popElement(&t1) {
return nil, p.err;
}
t = t1;
// After popping the element, apply any undo records from
// the stack to restore the name translations that existed
// before we saw this element.
-func (p *Parser) popElement(name Name) bool {
+func (p *Parser) popElement(t *EndElement) bool {
s := p.pop();
+ name := t.Name;
switch {
case s == nil || s.kind != stkStart:
p.err = SyntaxError("unexpected end element </" + name.Local + ">");
return false;
case s.name.Local != name.Local:
+ if !p.Strict {
+ p.needClose = true;
+ p.toClose = t.Name;
+ t.Name = s.name;
+ return true;
+ }
p.err = SyntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">");
return false;
case s.name.Space != name.Space:
return true;
}
+// If the top element on the stack is autoclosing and
+// t is not the end tag, invent the end tag.
+func (p *Parser) autoClose(t Token) (Token, bool) {
+ if p.stk == nil || p.stk.kind != stkStart {
+ return nil, false;
+ }
+ name := strings.ToLower(p.stk.name.Local);
+ for _, s := range p.AutoClose {
+ if strings.ToLower(s) == name {
+ // This one should be auto closed if t doesn't close it.
+ et, ok := t.(EndElement);
+ if !ok || et.Name.Local != name {
+ return EndElement{p.stk.name}, true;
+ }
+ break;
+ }
+ }
+ return nil, false;
+}
+
+
// RawToken is like Token but does not verify that
// start and end elements match and does not translate
// name space prefixes to their corresponding URLs.
// Parsers are required to recognize lt, gt, amp, apos, and quot
// even if they have not been declared. That's all we allow.
var i int;
+ CharLoop:
for i = 0; i < len(p.tmp); i++ {
p.tmp[i], p.err = p.r.ReadByte();
if p.err != nil {
return nil;
}
- if p.tmp[i] == ';' {
+ c := p.tmp[i];
+ if c == ';' {
break;
}
+ if 'a' <= c && c <= 'z' ||
+ 'A' <= c && c <= 'Z' ||
+ '0' <= c && c <= '9' ||
+ c == '_' || c == '#' {
+ continue;
+ }
+ p.ungetc(c);
+ break;
}
s := string(p.tmp[0:i]);
if i >= len(p.tmp) {
+ if !p.Strict {
+ b0, b1 = 0, 0;
+ p.buf.WriteByte('&');
+ p.buf.Write(p.tmp[0:i]);
+ continue Input;
+ }
p.err = SyntaxError("character entity expression &" + s + "... too long");
return nil;
}
- rune := -1;
+ var haveText bool;
+ var text string;
if i >= 2 && s[0] == '#' {
var n uint64;
var err os.Error;
n, err = strconv.Btoui64(s[1:len(s)], 10);
}
if err == nil && n <= unicode.MaxRune {
- rune = int(n);
+ text = string(n);
+ haveText = true;
}
} else {
if r, ok := entity[s]; ok {
- rune = r;
+ text = string(r);
+ haveText = true;
+ } else {
+ text, haveText = p.Entity[s];
}
}
- if rune < 0 {
+ if !haveText {
+ if !p.Strict {
+ b0, b1 = 0, 0;
+ p.buf.WriteByte('&');
+ p.buf.Write(p.tmp[0:i]);
+ continue Input;
+ }
p.err = SyntaxError("invalid character entity &" + s + ";");
return nil;
}
- i = utf8.EncodeRune(rune, &p.tmp);
- p.buf.Write(p.tmp[0:i]);
+ p.buf.Write(strings.Bytes(text));
b0, b1 = 0, 0;
continue Input;
}
func isNameByte(c byte) bool {
return 'A' <= c && c <= 'Z' ||
'a' <= c && c <= 'z' ||
+ '0' <= c && c <= '9' ||
c == '_' || c == ':' || c == '.' || c == '-';
}
unicode.Range{0x309D, 0x309E, 1},
unicode.Range{0x30FC, 0x30FE, 1},
}
+
+// HTMLEntity is an entity map containing translations for the
+// standard HTML entity characters.
+var HTMLEntity = htmlEntity
+
+var htmlEntity = map[string]string {
+/*
+ hget http://www.w3.org/TR/html4/sgml/entities.html |
+ ssam '
+ ,y /\>/ x/\<(.|\n)+/ s/\n/ /g
+ ,x v/^\<!ENTITY/d
+ ,s/\<!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/ "\1": "\\u\2",/g
+ '
+*/
+ "nbsp": "\u00A0",
+ "iexcl": "\u00A1",
+ "cent": "\u00A2",
+ "pound": "\u00A3",
+ "curren": "\u00A4",
+ "yen": "\u00A5",
+ "brvbar": "\u00A6",
+ "sect": "\u00A7",
+ "uml": "\u00A8",
+ "copy": "\u00A9",
+ "ordf": "\u00AA",
+ "laquo": "\u00AB",
+ "not": "\u00AC",
+ "shy": "\u00AD",
+ "reg": "\u00AE",
+ "macr": "\u00AF",
+ "deg": "\u00B0",
+ "plusmn": "\u00B1",
+ "sup2": "\u00B2",
+ "sup3": "\u00B3",
+ "acute": "\u00B4",
+ "micro": "\u00B5",
+ "para": "\u00B6",
+ "middot": "\u00B7",
+ "cedil": "\u00B8",
+ "sup1": "\u00B9",
+ "ordm": "\u00BA",
+ "raquo": "\u00BB",
+ "frac14": "\u00BC",
+ "frac12": "\u00BD",
+ "frac34": "\u00BE",
+ "iquest": "\u00BF",
+ "Agrave": "\u00C0",
+ "Aacute": "\u00C1",
+ "Acirc": "\u00C2",
+ "Atilde": "\u00C3",
+ "Auml": "\u00C4",
+ "Aring": "\u00C5",
+ "AElig": "\u00C6",
+ "Ccedil": "\u00C7",
+ "Egrave": "\u00C8",
+ "Eacute": "\u00C9",
+ "Ecirc": "\u00CA",
+ "Euml": "\u00CB",
+ "Igrave": "\u00CC",
+ "Iacute": "\u00CD",
+ "Icirc": "\u00CE",
+ "Iuml": "\u00CF",
+ "ETH": "\u00D0",
+ "Ntilde": "\u00D1",
+ "Ograve": "\u00D2",
+ "Oacute": "\u00D3",
+ "Ocirc": "\u00D4",
+ "Otilde": "\u00D5",
+ "Ouml": "\u00D6",
+ "times": "\u00D7",
+ "Oslash": "\u00D8",
+ "Ugrave": "\u00D9",
+ "Uacute": "\u00DA",
+ "Ucirc": "\u00DB",
+ "Uuml": "\u00DC",
+ "Yacute": "\u00DD",
+ "THORN": "\u00DE",
+ "szlig": "\u00DF",
+ "agrave": "\u00E0",
+ "aacute": "\u00E1",
+ "acirc": "\u00E2",
+ "atilde": "\u00E3",
+ "auml": "\u00E4",
+ "aring": "\u00E5",
+ "aelig": "\u00E6",
+ "ccedil": "\u00E7",
+ "egrave": "\u00E8",
+ "eacute": "\u00E9",
+ "ecirc": "\u00EA",
+ "euml": "\u00EB",
+ "igrave": "\u00EC",
+ "iacute": "\u00ED",
+ "icirc": "\u00EE",
+ "iuml": "\u00EF",
+ "eth": "\u00F0",
+ "ntilde": "\u00F1",
+ "ograve": "\u00F2",
+ "oacute": "\u00F3",
+ "ocirc": "\u00F4",
+ "otilde": "\u00F5",
+ "ouml": "\u00F6",
+ "divide": "\u00F7",
+ "oslash": "\u00F8",
+ "ugrave": "\u00F9",
+ "uacute": "\u00FA",
+ "ucirc": "\u00FB",
+ "uuml": "\u00FC",
+ "yacute": "\u00FD",
+ "thorn": "\u00FE",
+ "yuml": "\u00FF",
+ "fnof": "\u0192",
+ "Alpha": "\u0391",
+ "Beta": "\u0392",
+ "Gamma": "\u0393",
+ "Delta": "\u0394",
+ "Epsilon": "\u0395",
+ "Zeta": "\u0396",
+ "Eta": "\u0397",
+ "Theta": "\u0398",
+ "Iota": "\u0399",
+ "Kappa": "\u039A",
+ "Lambda": "\u039B",
+ "Mu": "\u039C",
+ "Nu": "\u039D",
+ "Xi": "\u039E",
+ "Omicron": "\u039F",
+ "Pi": "\u03A0",
+ "Rho": "\u03A1",
+ "Sigma": "\u03A3",
+ "Tau": "\u03A4",
+ "Upsilon": "\u03A5",
+ "Phi": "\u03A6",
+ "Chi": "\u03A7",
+ "Psi": "\u03A8",
+ "Omega": "\u03A9",
+ "alpha": "\u03B1",
+ "beta": "\u03B2",
+ "gamma": "\u03B3",
+ "delta": "\u03B4",
+ "epsilon": "\u03B5",
+ "zeta": "\u03B6",
+ "eta": "\u03B7",
+ "theta": "\u03B8",
+ "iota": "\u03B9",
+ "kappa": "\u03BA",
+ "lambda": "\u03BB",
+ "mu": "\u03BC",
+ "nu": "\u03BD",
+ "xi": "\u03BE",
+ "omicron": "\u03BF",
+ "pi": "\u03C0",
+ "rho": "\u03C1",
+ "sigmaf": "\u03C2",
+ "sigma": "\u03C3",
+ "tau": "\u03C4",
+ "upsilon": "\u03C5",
+ "phi": "\u03C6",
+ "chi": "\u03C7",
+ "psi": "\u03C8",
+ "omega": "\u03C9",
+ "thetasym": "\u03D1",
+ "upsih": "\u03D2",
+ "piv": "\u03D6",
+ "bull": "\u2022",
+ "hellip": "\u2026",
+ "prime": "\u2032",
+ "Prime": "\u2033",
+ "oline": "\u203E",
+ "frasl": "\u2044",
+ "weierp": "\u2118",
+ "image": "\u2111",
+ "real": "\u211C",
+ "trade": "\u2122",
+ "alefsym": "\u2135",
+ "larr": "\u2190",
+ "uarr": "\u2191",
+ "rarr": "\u2192",
+ "darr": "\u2193",
+ "harr": "\u2194",
+ "crarr": "\u21B5",
+ "lArr": "\u21D0",
+ "uArr": "\u21D1",
+ "rArr": "\u21D2",
+ "dArr": "\u21D3",
+ "hArr": "\u21D4",
+ "forall": "\u2200",
+ "part": "\u2202",
+ "exist": "\u2203",
+ "empty": "\u2205",
+ "nabla": "\u2207",
+ "isin": "\u2208",
+ "notin": "\u2209",
+ "ni": "\u220B",
+ "prod": "\u220F",
+ "sum": "\u2211",
+ "minus": "\u2212",
+ "lowast": "\u2217",
+ "radic": "\u221A",
+ "prop": "\u221D",
+ "infin": "\u221E",
+ "ang": "\u2220",
+ "and": "\u2227",
+ "or": "\u2228",
+ "cap": "\u2229",
+ "cup": "\u222A",
+ "int": "\u222B",
+ "there4": "\u2234",
+ "sim": "\u223C",
+ "cong": "\u2245",
+ "asymp": "\u2248",
+ "ne": "\u2260",
+ "equiv": "\u2261",
+ "le": "\u2264",
+ "ge": "\u2265",
+ "sub": "\u2282",
+ "sup": "\u2283",
+ "nsub": "\u2284",
+ "sube": "\u2286",
+ "supe": "\u2287",
+ "oplus": "\u2295",
+ "otimes": "\u2297",
+ "perp": "\u22A5",
+ "sdot": "\u22C5",
+ "lceil": "\u2308",
+ "rceil": "\u2309",
+ "lfloor": "\u230A",
+ "rfloor": "\u230B",
+ "lang": "\u2329",
+ "rang": "\u232A",
+ "loz": "\u25CA",
+ "spades": "\u2660",
+ "clubs": "\u2663",
+ "hearts": "\u2665",
+ "diams": "\u2666",
+ "quot": "\u0022",
+ "amp": "\u0026",
+ "lt": "\u003C",
+ "gt": "\u003E",
+ "OElig": "\u0152",
+ "oelig": "\u0153",
+ "Scaron": "\u0160",
+ "scaron": "\u0161",
+ "Yuml": "\u0178",
+ "circ": "\u02C6",
+ "tilde": "\u02DC",
+ "ensp": "\u2002",
+ "emsp": "\u2003",
+ "thinsp": "\u2009",
+ "zwnj": "\u200C",
+ "zwj": "\u200D",
+ "lrm": "\u200E",
+ "rlm": "\u200F",
+ "ndash": "\u2013",
+ "mdash": "\u2014",
+ "lsquo": "\u2018",
+ "rsquo": "\u2019",
+ "sbquo": "\u201A",
+ "ldquo": "\u201C",
+ "rdquo": "\u201D",
+ "bdquo": "\u201E",
+ "dagger": "\u2020",
+ "Dagger": "\u2021",
+ "permil": "\u2030",
+ "lsaquo": "\u2039",
+ "rsaquo": "\u203A",
+ "euro": "\u20AC",
+}
+
+// HTMLAutoClose is the set of HTML elements that
+// should be considered to close automatically.
+var HTMLAutoClose = htmlAutoClose
+
+var htmlAutoClose = []string {
+/*
+ hget http://www.w3.org/TR/html4/loose.dtd |
+ 9 sed -n 's/<!ELEMENT (.*) - O EMPTY.+/ "\1",/p' | tr A-Z a-z
+*/
+ "basefont",
+ "br",
+ "area",
+ "link",
+ "img",
+ "param",
+ "hr",
+ "input",
+ "col ",
+ "frame",
+ "isindex",
+ "base",
+ "meta",
+}