]> Cypherpunks repositories - gostls13.git/commitdiff
go/doc/comment: add low-level parsing helpers
authorRuss Cox <rsc@golang.org>
Sun, 3 Apr 2022 12:00:55 +0000 (08:00 -0400)
committerRuss Cox <rsc@golang.org>
Mon, 11 Apr 2022 16:31:36 +0000 (16:31 +0000)
[This CL is part of a sequence implementing the proposal #51082.
The design doc is at https://go.dev/s/godocfmt-design.]

Implement helpers to recognize old-style headings,
plain text (not marked up) URLs, and Go identifiers.

For #51082.

Change-Id: Ibabce72ef3ffd79a9d33366091f8c76ef27d0182
Reviewed-on: https://go-review.googlesource.com/c/go/+/397277
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Jonathan Amsterdam <jba@google.com>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
TryBot-Result: Gopher Robot <gobot@golang.org>

src/go/doc/comment/old_test.go [new file with mode: 0644]
src/go/doc/comment/parse.go

diff --git a/src/go/doc/comment/old_test.go b/src/go/doc/comment/old_test.go
new file mode 100644 (file)
index 0000000..944f94d
--- /dev/null
@@ -0,0 +1,80 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// These tests are carried forward from the old go/doc implementation.
+
+package comment
+
+import "testing"
+
+var oldHeadingTests = []struct {
+       line string
+       ok   bool
+}{
+       {"Section", true},
+       {"A typical usage", true},
+       {"ΔΛΞ is Greek", true},
+       {"Foo 42", true},
+       {"", false},
+       {"section", false},
+       {"A typical usage:", false},
+       {"This code:", false},
+       {"δ is Greek", false},
+       {"Foo §", false},
+       {"Fermat's Last Sentence", true},
+       {"Fermat's", true},
+       {"'sX", false},
+       {"Ted 'Too' Bar", false},
+       {"Use n+m", false},
+       {"Scanning:", false},
+       {"N:M", false},
+}
+
+func TestIsOldHeading(t *testing.T) {
+       for _, tt := range oldHeadingTests {
+               if isOldHeading(tt.line, []string{"Text.", "", tt.line, "", "Text."}, 2) != tt.ok {
+                       t.Errorf("isOldHeading(%q) = %v, want %v", tt.line, !tt.ok, tt.ok)
+               }
+       }
+}
+
+var autoURLTests = []struct {
+       in, out string
+}{
+       {"", ""},
+       {"http://[::1]:8080/foo.txt", "http://[::1]:8080/foo.txt"},
+       {"https://www.google.com) after", "https://www.google.com"},
+       {"https://www.google.com:30/x/y/z:b::c. After", "https://www.google.com:30/x/y/z:b::c"},
+       {"http://www.google.com/path/:;!-/?query=%34b#093124", "http://www.google.com/path/:;!-/?query=%34b#093124"},
+       {"http://www.google.com/path/:;!-/?query=%34bar#093124", "http://www.google.com/path/:;!-/?query=%34bar#093124"},
+       {"http://www.google.com/index.html! After", "http://www.google.com/index.html"},
+       {"http://www.google.com/", "http://www.google.com/"},
+       {"https://www.google.com/", "https://www.google.com/"},
+       {"http://www.google.com/path.", "http://www.google.com/path"},
+       {"http://en.wikipedia.org/wiki/Camellia_(cipher)", "http://en.wikipedia.org/wiki/Camellia_(cipher)"},
+       {"http://www.google.com/)", "http://www.google.com/"},
+       {"http://gmail.com)", "http://gmail.com"},
+       {"http://gmail.com))", "http://gmail.com"},
+       {"http://gmail.com ((http://gmail.com)) ()", "http://gmail.com"},
+       {"http://example.com/ quux!", "http://example.com/"},
+       {"http://example.com/%2f/ /world.", "http://example.com/%2f/"},
+       {"http: ipsum //host/path", ""},
+       {"javascript://is/not/linked", ""},
+       {"http://foo", "http://foo"},
+       {"https://www.example.com/person/][Person Name]]", "https://www.example.com/person/"},
+       {"http://golang.org/)", "http://golang.org/"},
+       {"http://golang.org/hello())", "http://golang.org/hello()"},
+       {"http://git.qemu.org/?p=qemu.git;a=blob;f=qapi-schema.json;hb=HEAD", "http://git.qemu.org/?p=qemu.git;a=blob;f=qapi-schema.json;hb=HEAD"},
+       {"https://foo.bar/bal/x(])", "https://foo.bar/bal/x"}, // inner ] causes (]) to be cut off from URL
+       {"http://bar(])", "http://bar"},                       // same
+}
+
+func TestAutoURL(t *testing.T) {
+       for _, tt := range autoURLTests {
+               url, ok := autoURL(tt.in)
+               if url != tt.out || ok != (tt.out != "") {
+                       t.Errorf("autoURL(%q) = %q, %v, want %q, %v", tt.in, url, ok, tt.out, tt.out != "")
+               }
+       }
+}
index 672b115bf8f8c7e334198b8edaa302494d025075..12b66794135626e458de50b41982071efda9591d 100644 (file)
@@ -4,6 +4,12 @@
 
 package comment
 
+import (
+       "strings"
+       "unicode"
+       "unicode/utf8"
+)
+
 // A Doc is a parsed Go doc comment.
 type Doc struct {
        // Content is the sequence of content blocks in the comment.
@@ -167,3 +173,291 @@ type DocLink struct {
 }
 
 func (*DocLink) text() {}
+
+// leadingSpace returns the longest prefix of s consisting of spaces and tabs.
+func leadingSpace(s string) string {
+       i := 0
+       for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
+               i++
+       }
+       return s[:i]
+}
+
+// isOldHeading reports whether line is an old-style section heading.
+// line is all[off].
+func isOldHeading(line string, all []string, off int) bool {
+       if off <= 0 || all[off-1] != "" || off+2 >= len(all) || all[off+1] != "" || leadingSpace(all[off+2]) != "" {
+               return false
+       }
+
+       line = strings.TrimSpace(line)
+
+       // a heading must start with an uppercase letter
+       r, _ := utf8.DecodeRuneInString(line)
+       if !unicode.IsLetter(r) || !unicode.IsUpper(r) {
+               return false
+       }
+
+       // it must end in a letter or digit:
+       r, _ = utf8.DecodeLastRuneInString(line)
+       if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
+               return false
+       }
+
+       // exclude lines with illegal characters. we allow "(),"
+       if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") {
+               return false
+       }
+
+       // allow "'" for possessive "'s" only
+       for b := line; ; {
+               var ok bool
+               if _, b, ok = strings.Cut(b, "'"); !ok {
+                       break
+               }
+               if b != "s" && !strings.HasPrefix(b, "s ") {
+                       return false // ' not followed by s and then end-of-word
+               }
+       }
+
+       // allow "." when followed by non-space
+       for b := line; ; {
+               var ok bool
+               if _, b, ok = strings.Cut(b, "."); !ok {
+                       break
+               }
+               if b == "" || strings.HasPrefix(b, " ") {
+                       return false // not followed by non-space
+               }
+       }
+
+       return true
+}
+
+// autoURL checks whether s begins with a URL that should be hyperlinked.
+// If so, it returns the URL, which is a prefix of s, and ok == true.
+// Otherwise it returns "", false.
+// The caller should skip over the first len(url) bytes of s
+// before further processing.
+func autoURL(s string) (url string, ok bool) {
+       // Find the ://. Fast path to pick off non-URL,
+       // since we call this at every position in the string.
+       // The shortest possible URL is ftp://x, 7 bytes.
+       var i int
+       switch {
+       case len(s) < 7:
+               return "", false
+       case s[3] == ':':
+               i = 3
+       case s[4] == ':':
+               i = 4
+       case s[5] == ':':
+               i = 5
+       case s[6] == ':':
+               i = 6
+       default:
+               return "", false
+       }
+       if i+3 > len(s) || s[i:i+3] != "://" {
+               return "", false
+       }
+
+       // Check valid scheme.
+       if !isScheme(s[:i]) {
+               return "", false
+       }
+
+       // Scan host part. Must have at least one byte,
+       // and must start and end in non-punctuation.
+       i += 3
+       if i >= len(s) || !isHost(s[i]) || isPunct(s[i]) {
+               return "", false
+       }
+       i++
+       end := i
+       for i < len(s) && isHost(s[i]) {
+               if !isPunct(s[i]) {
+                       end = i + 1
+               }
+               i++
+       }
+       i = end
+
+       // At this point we are definitely returning a URL (scheme://host).
+       // We just have to find the longest path we can add to it.
+       // Heuristics abound.
+       // We allow parens, braces, and brackets,
+       // but only if they match (#5043, #22285).
+       // We allow .,:;?! in the path but not at the end,
+       // to avoid end-of-sentence punctuation (#18139, #16565).
+       stk := []byte{}
+       end = i
+Path:
+       for ; i < len(s); i++ {
+               if isPunct(s[i]) {
+                       continue
+               }
+               if !isPath(s[i]) {
+                       break
+               }
+               switch s[i] {
+               case '(':
+                       stk = append(stk, ')')
+               case '{':
+                       stk = append(stk, '}')
+               case '[':
+                       stk = append(stk, ']')
+               case ')', '}', ']':
+                       if len(stk) == 0 || stk[len(stk)-1] != s[i] {
+                               break Path
+                       }
+                       stk = stk[:len(stk)-1]
+               }
+               if len(stk) == 0 {
+                       end = i + 1
+               }
+       }
+
+       return s[:end], true
+}
+
+// isScheme reports whether s is a recognized URL scheme.
+// Note that if strings of new length (beyond 3-7)
+// are added here, the fast path at the top of autoURL will need updating.
+func isScheme(s string) bool {
+       switch s {
+       case "file",
+               "ftp",
+               "gopher",
+               "http",
+               "https",
+               "mailto",
+               "nntp":
+               return true
+       }
+       return false
+}
+
+// isHost reports whether c is a byte that can appear in a URL host,
+// like www.example.com or user@[::1]:8080
+func isHost(c byte) bool {
+       // mask is a 128-bit bitmap with 1s for allowed bytes,
+       // so that the byte c can be tested with a shift and an and.
+       // If c > 128, then 1<<c and 1<<(c-64) will both be zero,
+       // and this function will return false.
+       const mask = 0 |
+               (1<<26-1)<<'A' |
+               (1<<26-1)<<'a' |
+               (1<<10-1)<<'0' |
+               1<<'_' |
+               1<<'@' |
+               1<<'-' |
+               1<<'.' |
+               1<<'[' |
+               1<<']' |
+               1<<':'
+
+       return ((uint64(1)<<c)&(mask&(1<<64-1)) |
+               (uint64(1)<<(c-64))&(mask>>64)) != 0
+}
+
+// isPunct reports whether c is a punctuation byte that can appear
+// inside a path but not at the end.
+func isPunct(c byte) bool {
+       // mask is a 128-bit bitmap with 1s for allowed bytes,
+       // so that the byte c can be tested with a shift and an and.
+       // If c > 128, then 1<<c and 1<<(c-64) will both be zero,
+       // and this function will return false.
+       const mask = 0 |
+               1<<'.' |
+               1<<',' |
+               1<<':' |
+               1<<';' |
+               1<<'?' |
+               1<<'!'
+
+       return ((uint64(1)<<c)&(mask&(1<<64-1)) |
+               (uint64(1)<<(c-64))&(mask>>64)) != 0
+}
+
+// isPath reports whether c is a (non-punctuation) path byte.
+func isPath(c byte) bool {
+       // mask is a 128-bit bitmap with 1s for allowed bytes,
+       // so that the byte c can be tested with a shift and an and.
+       // If c > 128, then 1<<c and 1<<(c-64) will both be zero,
+       // and this function will return false.
+       const mask = 0 |
+               (1<<26-1)<<'A' |
+               (1<<26-1)<<'a' |
+               (1<<10-1)<<'0' |
+               1<<'$' |
+               1<<'\'' |
+               1<<'(' |
+               1<<')' |
+               1<<'*' |
+               1<<'+' |
+               1<<'&' |
+               1<<'#' |
+               1<<'=' |
+               1<<'@' |
+               1<<'~' |
+               1<<'_' |
+               1<<'/' |
+               1<<'-' |
+               1<<'[' |
+               1<<']' |
+               1<<'{' |
+               1<<'}' |
+               1<<'%'
+
+       return ((uint64(1)<<c)&(mask&(1<<64-1)) |
+               (uint64(1)<<(c-64))&(mask>>64)) != 0
+}
+
+// isName reports whether s is a capitalized Go identifier (like Name).
+func isName(s string) bool {
+       t, ok := ident(s)
+       if !ok || t != s {
+               return false
+       }
+       r, _ := utf8.DecodeRuneInString(s)
+       return unicode.IsUpper(r)
+}
+
+// ident checks whether s begins with a Go identifier.
+// If so, it returns the identifier, which is a prefix of s, and ok == true.
+// Otherwise it returns "", false.
+// The caller should skip over the first len(id) bytes of s
+// before further processing.
+func ident(s string) (id string, ok bool) {
+       // Scan [\pL_][\pL_0-9]*
+       n := 0
+       for n < len(s) {
+               if c := s[n]; c < utf8.RuneSelf {
+                       if isIdentASCII(c) && (n > 0 || c < '0' || c > '9') {
+                               n++
+                               continue
+                       }
+                       break
+               }
+               r, nr := utf8.DecodeRuneInString(s)
+               if unicode.IsLetter(r) {
+                       n += nr
+                       continue
+               }
+               break
+       }
+       return s[:n], n > 0
+}
+
+// isIdentASCII reports whether c is an ASCII identifier byte.
+func isIdentASCII(c byte) bool {
+       const mask = 0 |
+               (1<<26-1)<<'A' |
+               (1<<26-1)<<'a' |
+               (1<<10-1)<<'0' |
+               1<<'_'
+
+       return ((uint64(1)<<c)&(mask&(1<<64-1)) |
+               (uint64(1)<<(c-64))&(mask>>64)) != 0
+}