From: Russ Cox Date: Mon, 5 Oct 2009 22:10:00 +0000 (-0700) Subject: XML parser X-Git-Tag: weekly.2009-11-06~410 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=0ddfe70c27304f280779ba6cead1d1c41f506a0a;p=gostls13.git XML parser R=r DELTA=546 (545 added, 0 deleted, 1 changed) OCL=35318 CL=35341 --- diff --git a/src/pkg/Make.deps b/src/pkg/Make.deps index 1f85b2c398..9f95da5409 100644 --- a/src/pkg/Make.deps +++ b/src/pkg/Make.deps @@ -30,7 +30,7 @@ expvar.install: bytes.install fmt.install http.install log.install strconv.insta flag.install: fmt.install os.install strconv.install fmt.install: io.install os.install reflect.install strconv.install utf8.install go/ast.install: go/token.install unicode.install utf8.install -go/doc.install: container/vector.install go/ast.install go/token.install io.install once.install regexp.install sort.install strings.install template.install +go/doc.install: container/vector.install go/ast.install go/token.install io.install regexp.install sort.install strings.install template.install go/parser.install: bytes.install container/vector.install fmt.install go/ast.install go/scanner.install go/token.install io.install os.install path.install strings.install go/printer.install: bytes.install container/vector.install fmt.install go/ast.install go/token.install io.install os.install reflect.install runtime.install strings.install tabwriter.install go/scanner.install: bytes.install container/vector.install fmt.install go/token.install io.install os.install sort.install strconv.install unicode.install utf8.install @@ -68,3 +68,4 @@ testing/iotest.install: bytes.install io.install log.install os.install time.install: io.install once.install os.install syscall.install unicode.install: utf8.install: unicode.install +xml.install: bufio.install bytes.install io.install os.install reflect.install strconv.install strings.install unicode.install utf8.install diff --git a/src/pkg/Makefile b/src/pkg/Makefile index 6dd11f93e3..13899671d5 100644 --- a/src/pkg/Makefile +++ b/src/pkg/Makefile @@ -82,6 +82,7 @@ DIRS=\ time\ unicode\ utf8\ + xml\ NOTEST=\ debug/proc\ diff --git a/src/pkg/xml/Makefile b/src/pkg/xml/Makefile new file mode 100644 index 0000000000..3c005c6059 --- /dev/null +++ b/src/pkg/xml/Makefile @@ -0,0 +1,13 @@ +# Copyright 2009 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +include $(GOROOT)/src/Make.$(GOARCH) + +TARG=xml + +GOFILES=\ + read.go\ + xml.go\ + +include $(GOROOT)/src/Make.pkg diff --git a/src/pkg/xml/read.go b/src/pkg/xml/read.go new file mode 100644 index 0000000000..c6f81ee5ca --- /dev/null +++ b/src/pkg/xml/read.go @@ -0,0 +1,320 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xml + +import ( + "bytes"; + "io"; + "os"; + "reflect"; + "strings"; +) + +// BUG(rsc): Mapping between XML elements and data structures is inherently flawed: +// an XML element is an order-dependent collection of anonymous +// values, while a data structure is an order-independent collection +// of named values. +// See package json for a textual representation more suitable +// to data structures. + +// Unmarshal parses an XML element from r and uses the +// reflect library to fill in an arbitrary struct, slice, or string +// pointed at by val. Well-formed data that does not fit +// into val is discarded. +// +// For example, given these definitions: +// +// type Email struct { +// Where string "attr"; +// Addr string; +// } +// +// type Result struct { +// XMLName xml.Name "result"; +// Name string; +// Phone string; +// Email []Email; +// } +// +// var result = Result{ "name", "phone", nil } +// +// unmarshalling the XML input +// +// +// +// gre@example.com +// +// +// gre@work.com +// +// Grace R. Emlin +//
123 Main Street
+//
+// +// via Unmarshal(r, &result) is equivalent to assigning +// +// r = Result{ +// xml.Name{"", "result"}, +// "Grace R. Emlin", // name +// "phone", // no phone given +// []Email{ +// Email{ "home", "gre@example.com" }, +// Email{ "work", "gre@work.com" } +// } +// } +// +// Note that the field r.Phone has not been modified and +// that the XML
element was discarded. +// +// Because Unmarshal uses the reflect package, it can only +// assign to upper case fields. Unmarshal uses a case-insensitive +// comparison to match XML element names to struct field names. +// +// Unmarshal maps an XML element to a struct using the following rules: +// +// * If the struct has a field named XMLName of type xml.Name, +// Unmarshal records the element name in that field. +// +// * If the XMLName field has an associated tag string of the form +// "tag" or "namespace-URL tag", the XML element must have +// the given tag (and, optionally, name space) or else Unmarshal +// returns an error. +// +// * If the XML element has an attribute whose name matches a +// struct field of type string with tag "attr", Unmarshal records +// the attribute value in that field. +// +// * If the XML element contains character data, that data is +// accumulated in the first struct field that has tag "chardata". +// The struct field may have type []byte or string. +// If there is no such field, the character data is discarded. +// +// * If the XML element contains a sub-element whose name +// matches a struct field whose tag is neither "attr" nor "chardata", +// Unmarshal maps the sub-element to that struct field. +// +// Unmarshal maps an XML element to a string or []byte by saving the +// concatenation of that elements character data in the string or []byte. +// +// Unmarshal maps an XML element to a slice by extending the length +// of the slice and mapping the element to the newly created value. +// +func Unmarshal(r io.Reader, val interface{}) os.Error { + v, ok := reflect.NewValue(val).(*reflect.PtrValue); + if !ok { + return os.NewError("non-pointer passed to Unmarshal"); + } + p := NewParser(r); + elem := v.Elem(); + for { + err := p.unmarshal(elem, nil); + if err != nil { + if err == os.EOF { + break; + } + return err; + } + } + return nil; +} + +// An UnmarshalError represents an error in the unmarshalling process. +type UnmarshalError string +func (e UnmarshalError) String() string { + return string(e); +} + +// Unmarshal a single XML element into val. +func (p *Parser) unmarshal(val reflect.Value, start *StartElement) os.Error { + // Find start element if we need it. + if start == nil { + for { + tok, err := p.Token(); + if err != nil { + return err; + } + if t, ok := tok.(StartElement); ok { + start = &t; + break; + } + } + } + + var ( + data []byte; + saveData reflect.Value; + sv *reflect.StructValue; + styp *reflect.StructType; + ) + switch v := val.(type) { + case *reflect.SliceValue: + typ := v.Type().(*reflect.SliceType); + if _, ok := typ.Elem().(*reflect.Uint8Type); ok { + // []byte + saveData = v; + break; + } + + // Slice of element values. + // Grow slice. + n := v.Len(); + if n >= v.Cap() { + ncap := 2*n; + if ncap < 4 { + ncap = 4; + } + new := reflect.MakeSlice(typ, n, ncap); + reflect.ArrayCopy(new, v); + v.Set(new); + } + v.SetLen(n+1); + + // Recur to read element into slice. + if err := p.unmarshal(v.Elem(n), start); err != nil { + v.SetLen(n); + return err; + } + return nil; + + case *reflect.StringValue: + saveData = v; + + case *reflect.StructValue: + sv = v; + typ := sv.Type().(*reflect.StructType); + styp = typ; + // Assign name. + if f, ok := typ.FieldByName("XMLName"); ok { + // Validate element name. + if f.Tag != "" { + tag := f.Tag; + ns := ""; + i := strings.LastIndex(tag, " "); + if i >= 0 { + ns, tag = tag[0:i], tag[i+1:len(tag)]; + } + if tag != start.Name.Local { + return UnmarshalError("expected element type <" + tag + "> but have <" + start.Name.Local + ">"); + } + if ns != "" && ns != start.Name.Space { + e := "expected element <" + tag + "> in name space " + ns + " but have "; + if start.Name.Space == "" { + e += "no name space"; + } else { + e += start.Name.Space; + } + return UnmarshalError(e); + } + } + + // Save + v := sv.FieldByIndex(f.Index); + if _, ok := v.Interface().(Name); !ok { + return UnmarshalError(sv.Type().String() + " field XMLName does not have type xml.Name"); + } + v.(*reflect.StructValue).Set(reflect.NewValue(start.Name).(*reflect.StructValue)); + } + + // Assign attributes. + // Also, do we need to save character data? + for i, n := 0, typ.NumField(); i < n; i++ { + f := typ.Field(i); + switch f.Tag { + case "attr": + strv, ok := sv.FieldByIndex(f.Index).(*reflect.StringValue); + if !ok { + return UnmarshalError(sv.Type().String() + " field " + f.Name + " has attr tag but is not type string"); + } + // Look for attribute. + val := ""; + k := strings.ToLower(f.Name); + for _, a := range start.Attr { + if strings.ToLower(a.Name.Local) == k { + val = a.Value; + break; + } + } + strv.Set(val); + + case "chardata": + if saveData == nil { + saveData = sv.FieldByIndex(f.Index); + } + } + } + } + + // Find end element. + // Process sub-elements along the way. +Loop: + for { + tok, err := p.Token(); + if err != nil { + return err; + } + switch t := tok.(type) { + case StartElement: + // Sub-element. + if sv != nil { + k := strings.ToLower(t.Name.Local); + for i, n := 0, styp.NumField(); i < n; i++ { + f := styp.Field(i); + if strings.ToLower(f.Name) == k { + if err := p.unmarshal(sv.FieldByIndex(f.Index), &t); err != nil { + return err; + } + continue Loop; + } + } + } + // Not saving sub-element but still have to skip over it. + if err := p.skip(); err != nil { + return err; + } + + case EndElement: + break Loop; + + case CharData: + if saveData != nil { + data = bytes.Add(data, t); + } + } + } + + // Save accumulated character data + if saveData != nil { + switch t := saveData.(type) { + case *reflect.StringValue: + t.Set(string(data)); + case *reflect.SliceValue: + t.Set(reflect.NewValue(data).(*reflect.SliceValue)); + } + } + + return nil; +} + +// Have already read a start element. +// Read tokens until we find the end element. +// Token is taking care of making sure the +// end element matches the start element we saw. +func (p *Parser) skip() os.Error { + for { + tok, err := p.Token(); + if err != nil { + return err; + } + switch t := tok.(type) { + case StartElement: + if err := p.skip(); err != nil { + return err; + } + case EndElement: + return nil; + } + } + panic("unreachable"); +} diff --git a/src/pkg/xml/read_test.go b/src/pkg/xml/read_test.go new file mode 100644 index 0000000000..760d28b924 --- /dev/null +++ b/src/pkg/xml/read_test.go @@ -0,0 +1,210 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package xml + +import ( + "reflect"; + "testing"; +) + +// Stripped down Atom feed data structures. + +func TestUnmarshalFeed(t *testing.T) { + var f Feed; + if err := Unmarshal(StringReader(rssFeedString), &f); err != nil { + t.Fatalf("Unmarshal: %s", err); + } + if !reflect.DeepEqual(f, rssFeed) { + t.Fatalf("have %#v\nwant %#v\n\n%#v", f); + } +} + +// hget http://codereview.appspot.com/rss/mine/rsc +const rssFeedString = ` + +Code Review - My issueshttp://codereview.appspot.com/2009-10-04T01:35:58+00:00rietveldrietveld: an attempt at pubsubhubbub +2009-10-04T01:35:58+00:00email-address-removedurn:md5:134d9179c41f806be79b3a5f7877d19a + An attempt at adding pubsubhubbub support to Rietveld. +http://code.google.com/p/pubsubhubbub +http://code.google.com/p/rietveld/issues/detail?id=155 + +The server side of the protocol is trivial: + 1. add a &lt;link rel=&quot;hub&quot; href=&quot;hub-server&quot;&gt; tag to all + feeds that will be pubsubhubbubbed. + 2. every time one of those feeds changes, tell the hub + with a simple POST request. + +I have tested this by adding debug prints to a local hub +server and checking that the server got the right publish +requests. + +I can&#39;t quite get the server to work, but I think the bug +is not in my code. I think that the server expects to be +able to grab the feed and see the feed&#39;s actual URL in +the link rel=&quot;self&quot;, but the default value for that drops +the :port from the URL, and I cannot for the life of me +figure out how to get the Atom generator deep inside +django not to do that, or even where it is doing that, +or even what code is running to generate the Atom feed. +(I thought I knew but I added some assert False statements +and it kept running!) + +Ignoring that particular problem, I would appreciate +feedback on the right way to get the two values at +the top of feeds.py marked NOTE(rsc). + + +rietveld: correct tab handling +2009-10-03T23:02:17+00:00email-address-removedurn:md5:0a2a4f19bb815101f0ba2904aed7c35a + This fixes the buggy tab rendering that can be seen at +http://codereview.appspot.com/116075/diff/1/2 + +The fundamental problem was that the tab code was +not being told what column the text began in, so it +didn&#39;t know where to put the tab stops. Another problem +was that some of the code assumed that string byte +offsets were the same as column offsets, which is only +true if there are no tabs. + +In the process of fixing this, I cleaned up the arguments +to Fold and ExpandTabs and renamed them Break and +_ExpandTabs so that I could be sure that I found all the +call sites. I also wanted to verify that ExpandTabs was +not being used from outside intra_region_diff.py. + + +` + +type Feed struct { + XMLName Name "http://www.w3.org/2005/Atom feed"; + Title string; + Id string; + Link []Link; + Updated Time; + Author Person; + Entry []Entry; +} + +type Entry struct { + Title string; + Id string; + Link []Link; + Updated Time; + Author Person; + Summary Text; +} + +type Link struct { + Rel string "attr"; + Href string "attr"; +} + +type Person struct { + Name string; + URI string; + Email string; +} + +type Text struct { + Type string "attr"; + Body string "chardata"; +} + +type Time string + +var rssFeed = Feed{ + XMLName: Name{"http://www.w3.org/2005/Atom", "feed"}, + Title: "Code Review - My issues", + Link: []Link{ + Link{Rel: "alternate", Href: "http://codereview.appspot.com/"}, + Link{Rel: "self", Href: "http://codereview.appspot.com/rss/mine/rsc"}, + }, + Id: "http://codereview.appspot.com/", + Updated: "2009-10-04T01:35:58+00:00", + Author: Person{ + Name: "rietveld" + }, + Entry: []Entry{ + Entry{ + Title: "rietveld: an attempt at pubsubhubbub\n", + Link: []Link{ + Link{Rel: "alternate", Href: "http://codereview.appspot.com/126085"}, + }, + Updated: "2009-10-04T01:35:58+00:00", + Author: Person{ + Name: "email-address-removed" + }, + Id: "urn:md5:134d9179c41f806be79b3a5f7877d19a", + Summary: Text{ + Type: "html", + Body: ` + An attempt at adding pubsubhubbub support to Rietveld. +http://code.google.com/p/pubsubhubbub +http://code.google.com/p/rietveld/issues/detail?id=155 + +The server side of the protocol is trivial: + 1. add a <link rel="hub" href="hub-server"> tag to all + feeds that will be pubsubhubbubbed. + 2. every time one of those feeds changes, tell the hub + with a simple POST request. + +I have tested this by adding debug prints to a local hub +server and checking that the server got the right publish +requests. + +I can't quite get the server to work, but I think the bug +is not in my code. I think that the server expects to be +able to grab the feed and see the feed's actual URL in +the link rel="self", but the default value for that drops +the :port from the URL, and I cannot for the life of me +figure out how to get the Atom generator deep inside +django not to do that, or even where it is doing that, +or even what code is running to generate the Atom feed. +(I thought I knew but I added some assert False statements +and it kept running!) + +Ignoring that particular problem, I would appreciate +feedback on the right way to get the two values at +the top of feeds.py marked NOTE(rsc). + + +` + }, + }, + Entry{ + Title: "rietveld: correct tab handling\n", + Link: []Link{ + Link{Rel: "alternate", Href: "http://codereview.appspot.com/124106"}, + }, + Updated: "2009-10-03T23:02:17+00:00", + Author: Person{ + Name: "email-address-removed" + }, + Id: "urn:md5:0a2a4f19bb815101f0ba2904aed7c35a", + Summary: Text{ + Type: "html", + Body: ` + This fixes the buggy tab rendering that can be seen at +http://codereview.appspot.com/116075/diff/1/2 + +The fundamental problem was that the tab code was +not being told what column the text began in, so it +didn't know where to put the tab stops. Another problem +was that some of the code assumed that string byte +offsets were the same as column offsets, which is only +true if there are no tabs. + +In the process of fixing this, I cleaned up the arguments +to Fold and ExpandTabs and renamed them Break and +_ExpandTabs so that I could be sure that I found all the +call sites. I also wanted to verify that ExpandTabs was +not being used from outside intra_region_diff.py. + + +` + } + }, + } +}