From: David Symonds
Date: Mon, 6 Jun 2011 06:46:14 +0000 (+1000)
Subject: mail: address list parsing.
X-Git-Tag: weekly.2011-06-09~56
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=ff0198b72fd6753975671cb2d46cd93747d37d2d;p=gostls13.git

mail: address list parsing.

R=golang-dev, r, r
CC=golang-dev
https://golang.org/cl/4547084
---

diff --git a/src/pkg/mail/message.go b/src/pkg/mail/message.go
index 9723863fee..50d89d3574 100644
--- a/src/pkg/mail/message.go
+++ b/src/pkg/mail/message.go
@@ -2,17 +2,42 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Package mail implements parsing of mail messages according to RFC 5322.
+/*
+Package mail implements parsing of mail messages.
+
+For the most part, this package follows the syntax as specified by RFC 5322.
+Notable divergences:
+	* Obsolete address formats are not parsed, including addresses with
+	  embedded route information.
+	* Group addresses are not parsed.
+	* The full range of spacing (the CFWS syntax element) is not supported,
+	  such as breaking addresses across lines.
+*/
 package mail
 
 import (
 	"bufio"
+	"bytes"
+	"fmt"
 	"io"
+	"log"
 	"net/textproto"
 	"os"
+	"strconv"
+	"strings"
 	"time"
 )
 
+var debug = debugT(false) // flip to true to trace the address parser via log.Printf
+
+type debugT bool
+
+func (d debugT) Printf(format string, args ...interface{}) {
+	if d {
+		log.Printf(format, args...)
+	}
+}
+
 // A Message represents a parsed mail message.
 type Message struct {
 	Header Header
@@ -93,3 +118,277 @@ func (h Header) Date() (*time.Time, os.Error) {
 	}
 	return parseDate(hdr)
 }
+
+// AddressList parses the named header field as a list of addresses.
+func (h Header) AddressList(key string) ([]*Address, os.Error) {
+	hdr := h.Get(key)
+	if hdr == "" {
+		return nil, ErrHeaderNotPresent
+	}
+	return newAddrParser(hdr).parseAddressList()
+}
+
+// Address represents a single mail address.
+// An address such as "Barry Gibbs <bg@example.com>" is represented
+// as Address{Name: "Barry Gibbs", Address: "bg@example.com"}.
+type Address struct {
+	Name    string // Proper name; may be empty.
+	Address string // user@domain
+}
+
+func (a *Address) String() string {
+	s := "<" + a.Address + ">"
+	if a.Name == "" {
+		return s
+	}
+	return strconv.Quote(a.Name) + " " + s // Quote already adds the surrounding double quotes.
+}
+
+type addrParser []byte
+
+func newAddrParser(s string) *addrParser {
+	p := addrParser([]byte(s))
+	return &p
+}
+
+func (p *addrParser) parseAddressList() ([]*Address, os.Error) {
+	var list []*Address
+	for {
+		p.skipSpace()
+		addr, err := p.parseAddress()
+		if err != nil {
+			return nil, err
+		}
+		list = append(list, addr)
+
+		p.skipSpace()
+		if p.empty() {
+			break
+		}
+		if !p.consume(',') {
+			return nil, os.ErrorString("mail: expected comma")
+		}
+	}
+	return list, nil
+}
+
+// parseAddress parses a single RFC 5322 address at the start of p.
+func (p *addrParser) parseAddress() (addr *Address, err os.Error) {
+	debug.Printf("parseAddress: %q", *p)
+	p.skipSpace()
+	if p.empty() {
+		return nil, os.ErrorString("mail: no address")
+	}
+
+	// address = name-addr / addr-spec
+	// TODO(dsymonds): Support parsing group address.
+
+	// addr-spec has a more restricted grammar than name-addr,
+	// so try parsing it first, and fallback to name-addr.
+	// TODO(dsymonds): Is this really correct?
+	spec, err := p.consumeAddrSpec()
+	if err == nil {
+		return &Address{
+			Address: spec,
+		}, err
+	}
+	debug.Printf("parseAddress: not an addr-spec: %v", err)
+	debug.Printf("parseAddress: state is now %q", *p)
+
+	// display-name
+	var displayName string
+	if p.peek() != '<' {
+		displayName, err = p.consumePhrase()
+		if err != nil {
+			return nil, err
+		}
+	}
+	debug.Printf("parseAddress: displayName=%q", displayName)
+
+	// angle-addr = "<" addr-spec ">"
+	p.skipSpace()
+	if !p.consume('<') {
+		return nil, os.ErrorString("mail: no angle-addr")
+	}
+	spec, err = p.consumeAddrSpec()
+	if err != nil {
+		return nil, err
+	}
+	if !p.consume('>') {
+		return nil, os.ErrorString("mail: unclosed angle-addr")
+	}
+	debug.Printf("parseAddress: spec=%q", spec)
+
+	return &Address{
+		Name:    displayName,
+		Address: spec,
+	}, nil
+}
+
+// consumeAddrSpec parses a single RFC 5322 addr-spec at the start of p.
+func (p *addrParser) consumeAddrSpec() (spec string, err os.Error) {
+	debug.Printf("consumeAddrSpec: %q", *p)
+
+	orig := *p
+	defer func() {
+		if err != nil {
+			*p = orig // on failure, rewind so the caller can retry with another production
+		}
+	}()
+
+	// local-part = dot-atom / quoted-string
+	var localPart string
+	p.skipSpace()
+	if p.empty() {
+		return "", os.ErrorString("mail: no addr-spec")
+	}
+	if p.peek() == '"' {
+		// quoted-string
+		debug.Printf("consumeAddrSpec: parsing quoted-string")
+		localPart, err = p.consumeQuotedString()
+	} else {
+		// dot-atom
+		debug.Printf("consumeAddrSpec: parsing dot-atom")
+		localPart, err = p.consumeAtom(true)
+	}
+	if err != nil {
+		debug.Printf("consumeAddrSpec: failed: %v", err)
+		return "", err
+	}
+
+	if !p.consume('@') {
+		return "", os.ErrorString("mail: missing @ in addr-spec")
+	}
+
+	// domain = dot-atom / domain-literal
+	var domain string
+	p.skipSpace()
+	if p.empty() {
+		return "", os.ErrorString("mail: no domain in addr-spec")
+	}
+	// TODO(dsymonds): Handle domain-literal
+	domain, err = p.consumeAtom(true)
+	if err != nil {
+		return "", err
+	}
+
+	return localPart + "@" + domain, nil
+}
+
+// consumePhrase parses the RFC 5322 phrase at the start of p.
+func (p *addrParser) consumePhrase() (phrase string, err os.Error) {
+	debug.Printf("consumePhrase: [%s]", *p)
+	// phrase = 1*word
+	var words []string
+	for {
+		// word = atom / quoted-string
+		var word string
+		p.skipSpace()
+		if p.empty() {
+			return "", os.ErrorString("mail: missing phrase")
+		}
+		if p.peek() == '"' {
+			// quoted-string
+			word, err = p.consumeQuotedString()
+		} else {
+			// atom
+			word, err = p.consumeAtom(false)
+		}
+		if err != nil {
+			break
+		}
+		debug.Printf("consumePhrase: consumed %q", word)
+		words = append(words, word)
+	}
+	// Ignore any error if we got at least one word.
+	if err != nil && len(words) == 0 {
+		debug.Printf("consumePhrase: hit err: %v", err)
+		return "", os.ErrorString("mail: missing word in phrase")
+	}
+	return strings.Join(words, " "), nil
+}
+
+// consumeQuotedString parses the quoted string at the start of p.
+func (p *addrParser) consumeQuotedString() (qs string, err os.Error) {
+	// Assume first byte is '"'.
+	i := 1
+	qsb := make([]byte, 0, 10)
+Loop:
+	for {
+		if i >= p.len() {
+			return "", os.ErrorString("mail: unclosed quoted-string")
+		}
+		switch c := (*p)[i]; {
+		case c == '"':
+			break Loop
+		case c == '\\':
+			if i+1 == p.len() {
+				return "", os.ErrorString("mail: unclosed quoted-string")
+			}
+			qsb = append(qsb, (*p)[i+1])
+			i += 2
+		case '!' <= c && c <= '~', c == ' ' || c == '\t':
+			// qtext (printable US-ASCII excluding " and \), or
+			// FWS (almost; we're ignoring CRLF)
+			qsb = append(qsb, c)
+			i++
+		default:
+			return "", fmt.Errorf("mail: bad character in quoted-string: %q", c)
+		}
+	}
+	*p = (*p)[i+1:]
+	return string(qsb), nil
+}
+
+// consumeAtom parses an RFC 5322 atom at the start of p.
+// If dot is true, consumeAtom parses an RFC 5322 dot-atom instead.
+func (p *addrParser) consumeAtom(dot bool) (atom string, err os.Error) {
+	if p.empty() || !isAtext(p.peek(), false) { // guard: peek indexes (*p)[0] and would panic on empty input
+		return "", os.ErrorString("mail: invalid string")
+	}
+	i := 1
+	for ; i < p.len() && isAtext((*p)[i], dot); i++ {
+	}
+	// TODO(dsymonds): Remove the []byte() conversion here when 6g doesn't need it.
+	atom, *p = string([]byte((*p)[:i])), (*p)[i:]
+	return atom, nil
+}
+
+func (p *addrParser) consume(c byte) bool {
+	if p.empty() || p.peek() != c {
+		return false
+	}
+	*p = (*p)[1:]
+	return true
+}
+
+// skipSpace skips the leading space and tab characters.
+func (p *addrParser) skipSpace() {
+	*p = bytes.TrimLeft(*p, " \t")
+}
+
+func (p *addrParser) peek() byte {
+	return (*p)[0]
+}
+
+func (p *addrParser) empty() bool {
+	return p.len() == 0
+}
+
+func (p *addrParser) len() int {
+	return len(*p)
+}
+
+var atextChars = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
+	"abcdefghijklmnopqrstuvwxyz" +
+	"0123456789" +
+	"!#$%&'*+-/=?^_`{|}~")
+
+// isAtext returns true if c is an RFC 5322 atext character.
+// If dot is true, period is included.
+func isAtext(c byte, dot bool) bool {
+	if dot && c == '.' {
+		return true
+	}
+	return bytes.IndexByte(atextChars, c) >= 0
+}
diff --git a/src/pkg/mail/message_test.go b/src/pkg/mail/message_test.go
index 1d1c6352ea..c3ec236816 100644
--- a/src/pkg/mail/message_test.go
+++ b/src/pkg/mail/message_test.go
@@ -127,3 +127,74 @@ func TestDateParsing(t *testing.T) {
 		}
 	}
 }
+
+func TestAddressParsing(t *testing.T) {
+	tests := []struct {
+		addrsStr string
+		exp      []*Address
+	}{
+		// Bare address
+		{
+			`jdoe@machine.example`,
+			[]*Address{&Address{
+				Address: "jdoe@machine.example",
+			}},
+		},
+		// RFC 5322, Appendix A.1.1
+		{
+			`John Doe <jdoe@machine.example>`,
+			[]*Address{&Address{
+				Name:    "John Doe",
+				Address: "jdoe@machine.example",
+			}},
+		},
+		// RFC 5322, Appendix A.1.2
+		{
+			`"Joe Q. Public" <john.q.public@example.com>`,
+			[]*Address{&Address{
+				Name:    "Joe Q. Public",
+				Address: "john.q.public@example.com",
+			}},
+		},
+		{
+			`Mary Smith <mary@x.test>, jdoe@example.org, Who? <one@y.test>`,
+			[]*Address{
+				&Address{
+					Name:    "Mary Smith",
+					Address: "mary@x.test",
+				},
+				&Address{
+					Address: "jdoe@example.org",
+				},
+				&Address{
+					Name:    "Who?",
+					Address: "one@y.test",
+				},
+			},
+		},
+		{
+			`<boss@nil.test>, "Giant; \"Big\" Box" <sysservices@example.net>`,
+			[]*Address{
+				&Address{
+					Address: "boss@nil.test",
+				},
+				&Address{
+					Name:    `Giant; "Big" Box`,
+					Address: "sysservices@example.net",
+				},
+			},
+		},
+		// RFC 5322, Appendix A.1.3
+		// TODO(dsymonds): Group addresses.
+	}
+	for _, test := range tests {
+		addrs, err := newAddrParser(test.addrsStr).parseAddressList()
+		if err != nil {
+			t.Errorf("Failed parsing %q: %v", test.addrsStr, err)
+			continue
+		}
+		if !reflect.DeepEqual(addrs, test.exp) {
+			t.Errorf("Parse of %q: got %+v, want %+v", test.addrsStr, addrs, test.exp)
+		}
+	}
+}