]> Cypherpunks repositories - gostls13.git/commitdiff
introduce non-strict mode in xml parser,
authorRuss Cox <rsc@golang.org>
Mon, 12 Oct 2009 06:51:46 +0000 (23:51 -0700)
committerRuss Cox <rsc@golang.org>
Mon, 12 Oct 2009 06:51:46 +0000 (23:51 -0700)
good enough to parse some html.

in reader, add "comment" tag to collect
comment text.

do not allocate during Unmarshal unless pointer is nil.

R=r
DELTA=441  (416 added, 1 deleted, 24 changed)
OCL=35586
CL=35594

src/pkg/xml/read.go
src/pkg/xml/xml.go

index e2d07b913b1b473f043b0e9b8b8073ec936a4b1a..3671c45342ace775c7cbf76e8014ffe106b591fb 100644 (file)
@@ -162,14 +162,20 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
        }
 
        if pv, ok := val.(*reflect.PtrValue); ok {
-               zv := reflect.MakeZero(pv.Type().(*reflect.PtrType).Elem());
-               pv.PointTo(zv);
-               val = zv;
+               if pv.Get() == 0 {
+                       zv := reflect.MakeZero(pv.Type().(*reflect.PtrType).Elem());
+                       pv.PointTo(zv);
+                       val = zv;
+               } else {
+                       val = pv.Elem();
+               }
        }
 
        var (
                data []byte;
                saveData reflect.Value;
+               comment []byte;
+               saveComment reflect.Value;
                sv *reflect.StructValue;
                styp *reflect.StructType;
        )
@@ -251,7 +257,7 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
                }
 
                // Assign attributes.
-               // Also, determine whether we need to save character data.
+               // Also, determine whether we need to save character data or comments.
                for i, n := 0, typ.NumField(); i < n; i++ {
                        f := typ.Field(i);
                        switch f.Tag {
@@ -271,6 +277,11 @@ func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error {
                                }
                                strv.Set(val);
 
+                       case "comment":
+                               if saveComment == nil {
+                                       saveComment = sv.FieldByIndex(f.Index);
+                               }
+
                        case "chardata":
                                if saveData == nil {
                                        saveData = sv.FieldByIndex(f.Index);
@@ -326,17 +337,27 @@ Loop:
                        if saveData != nil {
                                data = bytes.Add(data, t);
                        }
+
+               case Comment:
+                       if saveComment != nil {
+                               comment = bytes.Add(comment, t);
+                       }
                }
        }
 
-       // Save accumulated character data
-       if saveData != nil {
-               switch t := saveData.(type) {
-               case *reflect.StringValue:
-                       t.Set(string(data));
-               case *reflect.SliceValue:
-                       t.Set(reflect.NewValue(data).(*reflect.SliceValue));
-               }
+       // Save accumulated character data and comments
+       switch t := saveData.(type) {
+       case *reflect.StringValue:
+               t.Set(string(data));
+       case *reflect.SliceValue:
+               t.Set(reflect.NewValue(data).(*reflect.SliceValue));
+       }
+
+       switch t := saveComment.(type) {
+       case *reflect.StringValue:
+               t.Set(string(comment));
+       case *reflect.SliceValue:
+               t.Set(reflect.NewValue(comment).(*reflect.SliceValue));
        }
 
        return nil;
index 5baaafbb3882b30c6a173fa268aa6ee753599e00..d73ebdc2ae61700205317c402adeb36c5f716478 100644 (file)
@@ -111,12 +111,49 @@ type readByter interface {
 // A Parser represents an XML parser reading a particular input stream.
 // The parser assumes that its input is encoded in UTF-8.
 type Parser struct {
+       // Strict defaults to true, enforcing the requirements
+       // of the XML specification.
+       // If set to false, the parser allows input containing common
+       // mistakes:
+       //      * If an element is missing an end tag, the parser invents
+       //        end tags as necessary to keep the return values from Token
+       //        properly balanced.
+       //      * In attribute values and character data, unknown or malformed
+       //        character entities (sequences beginning with &) are left alone.
+       //
+       // Setting:
+       //
+       //      p.Strict = false;
+       //      p.AutoClose = HTMLAutoClose;
+       //      p.Entity = HTMLEntity
+       //
+       // creates a parser that can handle typical HTML.
+       Strict  bool;
+
+       // When Strict == false, AutoClose indicates a set of elements to
+       // consider closed immediately after they are opened, regardless
+       // of whether an end element is present.
+       AutoClose       []string;
+
+       // Entity can be used to map non-standard entity names to string replacements.
+       // The parser behaves as if these standard mappings are present in the map,
+       // regardless of the actual map content:
+       //
+       //      "lt": "<",
+       //      "gt": ">",
+       //      "amp": "&",
+       //      "pos": "'",
+       //      "quot": `"`,
+       //
+       Entity  map[string]string;
+
        r               readByter;
        buf             bytes.Buffer;
        stk             *stack;
        free            *stack;
        needClose       bool;
        toClose         Name;
+       nextToken       Token;
        nextByte        int;
        ns              map[string]string;
        err             os.Error;
@@ -130,6 +167,7 @@ func NewParser(r io.Reader) *Parser {
                ns: make(map[string]string),
                nextByte: -1,
                line: 1,
+               Strict: true,
        };
 
        // Get efficient byte at a time reader.
@@ -169,9 +207,19 @@ func NewParser(r io.Reader) *Parser {
 // it uses the prefix as the Space rather than report an error.
 //
 func (p *Parser) Token() (t Token, err os.Error) {
-       if t, err = p.RawToken(); err != nil {
+       if p.nextToken != nil {
+               t = p.nextToken;
+               p.nextToken = nil;
+       } else if t, err = p.RawToken(); err != nil {
                return;
        }
+
+       if !p.Strict {
+               if t1, ok := p.autoClose(t); ok {
+                       p.nextToken = t;
+                       t = t1;
+               }
+       }
        switch t1 := t.(type) {
        case StartElement:
                // In XML name spaces, the translations listed in the
@@ -201,7 +249,7 @@ func (p *Parser) Token() (t Token, err os.Error) {
 
        case EndElement:
                p.translate(&t1.Name, true);
-               if !p.popElement(t1.Name) {
+               if !p.popElement(&t1) {
                        return nil, p.err;
                }
                t = t1;
@@ -286,13 +334,20 @@ func (p *Parser) pushNs(local string, url string, ok bool) {
 // After popping the element, apply any undo records from
 // the stack to restore the name translations that existed
 // before we saw this element.
-func (p *Parser) popElement(name Name) bool {
+func (p *Parser) popElement(t *EndElement) bool {
        s := p.pop();
+       name := t.Name;
        switch {
        case s == nil || s.kind != stkStart:
                p.err = SyntaxError("unexpected end element </" + name.Local + ">");
                return false;
        case s.name.Local != name.Local:
+               if !p.Strict {
+                       p.needClose = true;
+                       p.toClose = t.Name;
+                       t.Name = s.name;
+                       return true;
+               }
                p.err = SyntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">");
                return false;
        case s.name.Space != name.Space:
@@ -311,6 +366,27 @@ func (p *Parser) popElement(name Name) bool {
        return true;
 }
 
+// If the top element on the stack is autoclosing and
+// t is not the end tag, invent the end tag.
+func (p *Parser) autoClose(t Token) (Token, bool) {
+       if p.stk == nil || p.stk.kind != stkStart {
+               return nil, false;
+       }
+       name := strings.ToLower(p.stk.name.Local);
+       for _, s := range p.AutoClose {
+               if strings.ToLower(s) == name {
+                       // This one should be auto closed if t doesn't close it.
+                       et, ok := t.(EndElement);
+                       if !ok || et.Name.Local != name {
+                               return EndElement{p.stk.name}, true;
+                       }
+                       break;
+               }
+       }
+       return nil, false;
+}
+
+
 // RawToken is like Token but does not verify that
 // start and end elements match and does not translate
 // name space prefixes to their corresponding URLs.
@@ -645,21 +721,38 @@ Input:
                        // Parsers are required to recognize lt, gt, amp, apos, and quot
                        // even if they have not been declared.  That's all we allow.
                        var i int;
+               CharLoop:
                        for i = 0; i < len(p.tmp); i++ {
                                p.tmp[i], p.err = p.r.ReadByte();
                                if p.err != nil {
                                        return nil;
                                }
-                               if p.tmp[i] == ';' {
+                               c := p.tmp[i];
+                               if c == ';' {
                                        break;
                                }
+                               if 'a' <= c && c <= 'z' ||
+                                       'A' <= c && c <= 'Z' ||
+                                       '0' <= c && c <= '9' ||
+                                       c == '_' || c == '#' {
+                                       continue;
+                               }
+                               p.ungetc(c);
+                               break;
                        }
                        s := string(p.tmp[0:i]);
                        if i >= len(p.tmp) {
+                               if !p.Strict {
+                                       b0, b1 = 0, 0;
+                                       p.buf.WriteByte('&');
+                                       p.buf.Write(p.tmp[0:i]);
+                                       continue Input;
+                               }
                                p.err = SyntaxError("character entity expression &" + s + "... too long");
                                return nil;
                        }
-                       rune := -1;
+                       var haveText bool;
+                       var text string;
                        if i >= 2 && s[0] == '#' {
                                var n uint64;
                                var err os.Error;
@@ -669,19 +762,28 @@ Input:
                                        n, err = strconv.Btoui64(s[1:len(s)], 10);
                                }
                                if err == nil && n <= unicode.MaxRune {
-                                       rune = int(n);
+                                       text = string(n);
+                                       haveText = true;
                                }
                        } else {
                                if r, ok := entity[s]; ok {
-                                       rune = r;
+                                       text = string(r);
+                                       haveText = true;
+                               } else {
+                                       text, haveText = p.Entity[s];
                                }
                        }
-                       if rune < 0 {
+                       if !haveText {
+                               if !p.Strict {
+                                       b0, b1 = 0, 0;
+                                       p.buf.WriteByte('&');
+                                       p.buf.Write(p.tmp[0:i]);
+                                       continue Input;
+                               }
                                p.err = SyntaxError("invalid character entity &" + s + ";");
                                return nil;
                        }
-                       i = utf8.EncodeRune(rune, &p.tmp);
-                       p.buf.Write(p.tmp[0:i]);
+                       p.buf.Write(strings.Bytes(text));
                        b0, b1 = 0, 0;
                        continue Input;
                }
@@ -764,6 +866,7 @@ func (p *Parser) name() (s string, ok bool) {
 func isNameByte(c byte) bool {
        return 'A' <= c && c <= 'Z' ||
                'a' <= c && c <= 'z' ||
+               '0' <= c && c <= '9' ||
                c == '_' || c == ':' || c == '.' || c == '-';
 }
 
@@ -1079,3 +1182,294 @@ var second = []unicode.Range{
        unicode.Range{0x309D, 0x309E, 1},
        unicode.Range{0x30FC, 0x30FE, 1},
 }
+
+// HTMLEntity is an entity map containing translations for the
+// standard HTML entity characters.
+var HTMLEntity = htmlEntity
+
+var htmlEntity = map[string]string {
+/*
+       hget http://www.w3.org/TR/html4/sgml/entities.html |
+       ssam '
+               ,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
+               ,x v/^\&lt;!ENTITY/d
+               ,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/     "\1": "\\u\2",/g
+       '
+*/
+       "nbsp": "\u00A0",
+       "iexcl": "\u00A1",
+       "cent": "\u00A2",
+       "pound": "\u00A3",
+       "curren": "\u00A4",
+       "yen": "\u00A5",
+       "brvbar": "\u00A6",
+       "sect": "\u00A7",
+       "uml": "\u00A8",
+       "copy": "\u00A9",
+       "ordf": "\u00AA",
+       "laquo": "\u00AB",
+       "not": "\u00AC",
+       "shy": "\u00AD",
+       "reg": "\u00AE",
+       "macr": "\u00AF",
+       "deg": "\u00B0",
+       "plusmn": "\u00B1",
+       "sup2": "\u00B2",
+       "sup3": "\u00B3",
+       "acute": "\u00B4",
+       "micro": "\u00B5",
+       "para": "\u00B6",
+       "middot": "\u00B7",
+       "cedil": "\u00B8",
+       "sup1": "\u00B9",
+       "ordm": "\u00BA",
+       "raquo": "\u00BB",
+       "frac14": "\u00BC",
+       "frac12": "\u00BD",
+       "frac34": "\u00BE",
+       "iquest": "\u00BF",
+       "Agrave": "\u00C0",
+       "Aacute": "\u00C1",
+       "Acirc": "\u00C2",
+       "Atilde": "\u00C3",
+       "Auml": "\u00C4",
+       "Aring": "\u00C5",
+       "AElig": "\u00C6",
+       "Ccedil": "\u00C7",
+       "Egrave": "\u00C8",
+       "Eacute": "\u00C9",
+       "Ecirc": "\u00CA",
+       "Euml": "\u00CB",
+       "Igrave": "\u00CC",
+       "Iacute": "\u00CD",
+       "Icirc": "\u00CE",
+       "Iuml": "\u00CF",
+       "ETH": "\u00D0",
+       "Ntilde": "\u00D1",
+       "Ograve": "\u00D2",
+       "Oacute": "\u00D3",
+       "Ocirc": "\u00D4",
+       "Otilde": "\u00D5",
+       "Ouml": "\u00D6",
+       "times": "\u00D7",
+       "Oslash": "\u00D8",
+       "Ugrave": "\u00D9",
+       "Uacute": "\u00DA",
+       "Ucirc": "\u00DB",
+       "Uuml": "\u00DC",
+       "Yacute": "\u00DD",
+       "THORN": "\u00DE",
+       "szlig": "\u00DF",
+       "agrave": "\u00E0",
+       "aacute": "\u00E1",
+       "acirc": "\u00E2",
+       "atilde": "\u00E3",
+       "auml": "\u00E4",
+       "aring": "\u00E5",
+       "aelig": "\u00E6",
+       "ccedil": "\u00E7",
+       "egrave": "\u00E8",
+       "eacute": "\u00E9",
+       "ecirc": "\u00EA",
+       "euml": "\u00EB",
+       "igrave": "\u00EC",
+       "iacute": "\u00ED",
+       "icirc": "\u00EE",
+       "iuml": "\u00EF",
+       "eth": "\u00F0",
+       "ntilde": "\u00F1",
+       "ograve": "\u00F2",
+       "oacute": "\u00F3",
+       "ocirc": "\u00F4",
+       "otilde": "\u00F5",
+       "ouml": "\u00F6",
+       "divide": "\u00F7",
+       "oslash": "\u00F8",
+       "ugrave": "\u00F9",
+       "uacute": "\u00FA",
+       "ucirc": "\u00FB",
+       "uuml": "\u00FC",
+       "yacute": "\u00FD",
+       "thorn": "\u00FE",
+       "yuml": "\u00FF",
+       "fnof": "\u0192",
+       "Alpha": "\u0391",
+       "Beta": "\u0392",
+       "Gamma": "\u0393",
+       "Delta": "\u0394",
+       "Epsilon": "\u0395",
+       "Zeta": "\u0396",
+       "Eta": "\u0397",
+       "Theta": "\u0398",
+       "Iota": "\u0399",
+       "Kappa": "\u039A",
+       "Lambda": "\u039B",
+       "Mu": "\u039C",
+       "Nu": "\u039D",
+       "Xi": "\u039E",
+       "Omicron": "\u039F",
+       "Pi": "\u03A0",
+       "Rho": "\u03A1",
+       "Sigma": "\u03A3",
+       "Tau": "\u03A4",
+       "Upsilon": "\u03A5",
+       "Phi": "\u03A6",
+       "Chi": "\u03A7",
+       "Psi": "\u03A8",
+       "Omega": "\u03A9",
+       "alpha": "\u03B1",
+       "beta": "\u03B2",
+       "gamma": "\u03B3",
+       "delta": "\u03B4",
+       "epsilon": "\u03B5",
+       "zeta": "\u03B6",
+       "eta": "\u03B7",
+       "theta": "\u03B8",
+       "iota": "\u03B9",
+       "kappa": "\u03BA",
+       "lambda": "\u03BB",
+       "mu": "\u03BC",
+       "nu": "\u03BD",
+       "xi": "\u03BE",
+       "omicron": "\u03BF",
+       "pi": "\u03C0",
+       "rho": "\u03C1",
+       "sigmaf": "\u03C2",
+       "sigma": "\u03C3",
+       "tau": "\u03C4",
+       "upsilon": "\u03C5",
+       "phi": "\u03C6",
+       "chi": "\u03C7",
+       "psi": "\u03C8",
+       "omega": "\u03C9",
+       "thetasym": "\u03D1",
+       "upsih": "\u03D2",
+       "piv": "\u03D6",
+       "bull": "\u2022",
+       "hellip": "\u2026",
+       "prime": "\u2032",
+       "Prime": "\u2033",
+       "oline": "\u203E",
+       "frasl": "\u2044",
+       "weierp": "\u2118",
+       "image": "\u2111",
+       "real": "\u211C",
+       "trade": "\u2122",
+       "alefsym": "\u2135",
+       "larr": "\u2190",
+       "uarr": "\u2191",
+       "rarr": "\u2192",
+       "darr": "\u2193",
+       "harr": "\u2194",
+       "crarr": "\u21B5",
+       "lArr": "\u21D0",
+       "uArr": "\u21D1",
+       "rArr": "\u21D2",
+       "dArr": "\u21D3",
+       "hArr": "\u21D4",
+       "forall": "\u2200",
+       "part": "\u2202",
+       "exist": "\u2203",
+       "empty": "\u2205",
+       "nabla": "\u2207",
+       "isin": "\u2208",
+       "notin": "\u2209",
+       "ni": "\u220B",
+       "prod": "\u220F",
+       "sum": "\u2211",
+       "minus": "\u2212",
+       "lowast": "\u2217",
+       "radic": "\u221A",
+       "prop": "\u221D",
+       "infin": "\u221E",
+       "ang": "\u2220",
+       "and": "\u2227",
+       "or": "\u2228",
+       "cap": "\u2229",
+       "cup": "\u222A",
+       "int": "\u222B",
+       "there4": "\u2234",
+       "sim": "\u223C",
+       "cong": "\u2245",
+       "asymp": "\u2248",
+       "ne": "\u2260",
+       "equiv": "\u2261",
+       "le": "\u2264",
+       "ge": "\u2265",
+       "sub": "\u2282",
+       "sup": "\u2283",
+       "nsub": "\u2284",
+       "sube": "\u2286",
+       "supe": "\u2287",
+       "oplus": "\u2295",
+       "otimes": "\u2297",
+       "perp": "\u22A5",
+       "sdot": "\u22C5",
+       "lceil": "\u2308",
+       "rceil": "\u2309",
+       "lfloor": "\u230A",
+       "rfloor": "\u230B",
+       "lang": "\u2329",
+       "rang": "\u232A",
+       "loz": "\u25CA",
+       "spades": "\u2660",
+       "clubs": "\u2663",
+       "hearts": "\u2665",
+       "diams": "\u2666",
+       "quot": "\u0022",
+       "amp": "\u0026",
+       "lt": "\u003C",
+       "gt": "\u003E",
+       "OElig": "\u0152",
+       "oelig": "\u0153",
+       "Scaron": "\u0160",
+       "scaron": "\u0161",
+       "Yuml": "\u0178",
+       "circ": "\u02C6",
+       "tilde": "\u02DC",
+       "ensp": "\u2002",
+       "emsp": "\u2003",
+       "thinsp": "\u2009",
+       "zwnj": "\u200C",
+       "zwj": "\u200D",
+       "lrm": "\u200E",
+       "rlm": "\u200F",
+       "ndash": "\u2013",
+       "mdash": "\u2014",
+       "lsquo": "\u2018",
+       "rsquo": "\u2019",
+       "sbquo": "\u201A",
+       "ldquo": "\u201C",
+       "rdquo": "\u201D",
+       "bdquo": "\u201E",
+       "dagger": "\u2020",
+       "Dagger": "\u2021",
+       "permil": "\u2030",
+       "lsaquo": "\u2039",
+       "rsaquo": "\u203A",
+       "euro": "\u20AC",
+}
+
+// HTMLAutoClose is the set of HTML elements that
+// should be considered to close automatically.
+var HTMLAutoClose = htmlAutoClose
+
+var htmlAutoClose = []string {
+/*
+       hget http://www.w3.org/TR/html4/loose.dtd |
+       9 sed -n 's/<!ELEMENT (.*) - O EMPTY.+/ "\1",/p' | tr A-Z a-z
+*/
+       "basefont",
+       "br",
+       "area",
+       "link",
+       "img",
+       "param",
+       "hr",
+       "input",
+       "col     ",
+       "frame",
+       "isindex",
+       "base",
+       "meta",
+}