cmd/go/internal/module: define fs-safe module path encoding

author Russ Cox <rsc@golang.org>

Mon, 16 Jul 2018 02:56:37 +0000 (22:56 -0400)

committer Russ Cox <rsc@golang.org>

Wed, 18 Jul 2018 02:08:57 +0000 (02:08 +0000)
author Russ Cox <rsc@golang.org>
Mon, 16 Jul 2018 02:56:37 +0000 (22:56 -0400)
committer Russ Cox <rsc@golang.org>
Wed, 18 Jul 2018 02:08:57 +0000 (02:08 +0000)
diff --git a/src/cmd/go/internal/module/module.go b/src/cmd/go/internal/module/module.go

index 7b32b24abf130c833c3b767ce830f9a4d79d3c40..000699a0adc9c03af77a47e11ac7f301cde40c8a 100644 (file)
--- a/src/cmd/go/internal/module/module.go
+++ b/src/cmd/go/internal/module/module.go
@@ -6,11 +6,18 @@
  // along with support code.
  package module
  
+// IMPORTANT NOTE
+//
+// This file essentially defines the set of valid import paths for the go command.
+// There are many subtle considerations, including Unicode ambiguity,
+// security, network, and file system representations.
+//
+// Changes to the semantics in this file require approval from rsc.
+
  import (
         "fmt"
         "sort"
         "strings"
-       "unicode"
         "unicode/utf8"
  
         "cmd/go/internal/semver"
@@ -71,36 +78,33 @@ func Check(path, version string) error {
  
  // firstPathOK reports whether r can appear in the first element of a module path.
  // The first element of the path must be an LDH domain name, at least for now.
+// To avoid case ambiguity, the domain name must be entirely lower case.
  func firstPathOK(r rune) bool {
         return r == '-' || r == '.' ||
                 '0' <= r && r <= '9' ||
-               'A' <= r && r <= 'Z' ||
                 'a' <= r && r <= 'z'
  }
  
  // pathOK reports whether r can appear in a module path.
-// Paths must avoid potentially problematic ASCII punctuation
-// and control characters but otherwise can be any Unicode printable character,
-// as defined by Go's IsPrint.
+// Paths can be ASCII letters, ASCII digits, and limited ASCII punctuation: + - . / _ and ~.
+// This matches what "go get" has historically recognized in import paths.
+// TODO(rsc): We would like to allow Unicode letters, but that requires additional
+// care in the safe encoding (see note below).
  func pathOK(r rune) bool {
         if r < utf8.RuneSelf {
-               return r == '+' || r == ',' || r == '-' || r == '.' || r == '/' || r == '_' || r == '~' ||
+               return r == '+' || r == '-' || r == '.' || r == '/' || r == '_' || r == '~' ||
                         '0' <= r && r <= '9' ||
                         'A' <= r && r <= 'Z' ||
                         'a' <= r && r <= 'z'
         }
-       return unicode.IsPrint(r)
+       return false
  }
  
  // CheckPath checks that a module path is valid.
  func CheckPath(path string) error {
-       if !utf8.ValidString(path) {
-               return fmt.Errorf("malformed module path %q: invalid UTF-8", path)
+       if err := checkImportPath(path); err != nil {
+               return fmt.Errorf("malformed module path %q: %v", path, err)
         }
-       if path == "" {
-               return fmt.Errorf("malformed module path %q: empty string", path)
-       }
-
         i := strings.Index(path, "/")
         if i < 0 {
                 i = len(path)
@@ -111,40 +115,119 @@ func CheckPath(path string) error {
         if !strings.Contains(path[:i], ".") {
                 return fmt.Errorf("malformed module path %q: missing dot in first path element", path)
         }
-       if path[i-1] == '.' {
-               return fmt.Errorf("malformed module path %q: trailing dot in first path element", path)
-       }
-       if path[0] == '.' {
-               return fmt.Errorf("malformed module path %q: leading dot in first path element", path)
-       }
         if path[0] == '-' {
                 return fmt.Errorf("malformed module path %q: leading dash in first path element", path)
         }
-       if strings.Contains(path, "..") {
-               return fmt.Errorf("malformed module path %q: double dot", path)
-       }
-       if strings.Contains(path, "//") {
-               return fmt.Errorf("malformed module path %q: double slash", path)
-       }
         for _, r := range path[:i] {
                 if !firstPathOK(r) {
                         return fmt.Errorf("malformed module path %q: invalid char %q in first path element", path, r)
                 }
         }
+       if _, _, ok := SplitPathVersion(path); !ok {
+               return fmt.Errorf("malformed module path %q: invalid version %s", path, path[strings.LastIndex(path, "/")+1:])
+       }
+       return nil
+}
+
+// CheckImportPath checks that an import path is valid.
+func CheckImportPath(path string) error {
+       if err := checkImportPath(path); err != nil {
+               return fmt.Errorf("malformed import path %q: %v", path, err)
+       }
+       return nil
+}
+
+// checkImportPath checks that an import path is valid.
+// It returns an error describing why but not mentioning path.
+// Because these checks apply to both module paths and import paths,
+// the caller is expected to add the "malformed ___ path %q: " prefix.
+func checkImportPath(path string) error {
+       if !utf8.ValidString(path) {
+               return fmt.Errorf("invalid UTF-8")
+       }
+       if path == "" {
+               return fmt.Errorf("empty string")
+       }
+       if strings.Contains(path, "..") {
+               return fmt.Errorf("double dot")
+       }
+       if strings.Contains(path, "//") {
+               return fmt.Errorf("double slash")
+       }
         if path[len(path)-1] == '/' {
-               return fmt.Errorf("malformed module path %q: trailing slash", path)
+               return fmt.Errorf("trailing slash")
         }
-       for _, r := range path {
+       elemStart := 0
+       for i, r := range path {
                 if !pathOK(r) {
-                       return fmt.Errorf("malformed module path %q: invalid char %q", path, r)
+                       return fmt.Errorf("invalid char %q", r)
+               }
+               if r == '/' {
+                       if err := checkElem(path[elemStart:i]); err != nil {
+                               return err
+                       }
+                       elemStart = i + 1
                 }
         }
-       if _, _, ok := SplitPathVersion(path); !ok {
-               return fmt.Errorf("malformed module path %q: invalid version %s", path, path[strings.LastIndex(path, "/")+1:])
+       if err := checkElem(path[elemStart:]); err != nil {
+               return err
+       }
+       return nil
+}
+
+// checkElem checks whether an individual path element is valid.
+func checkElem(elem string) error {
+       if elem == "" {
+               return fmt.Errorf("empty path element")
+       }
+       if elem[0] == '.' {
+               return fmt.Errorf("leading dot in path element")
+       }
+       if elem[len(elem)-1] == '.' {
+               return fmt.Errorf("trailing dot in path element")
+       }
+
+       // Windows disallows a bunch of path elements, sadly.
+       // See https://docs.microsoft.com/en-us/windows/desktop/fileio/naming-a-file
+       short := elem
+       if i := strings.Index(short, "."); i >= 0 {
+               short = short[:i]
+       }
+       for _, bad := range badWindowsNames {
+               if strings.EqualFold(bad, short) {
+                       return fmt.Errorf("disallowed path element %q", elem)
+               }
         }
         return nil
  }
  
+// badWindowsNames are the reserved file path elements on Windows.
+// See https://docs.microsoft.com/en-us/windows/desktop/fileio/naming-a-file
+var badWindowsNames = []string{
+       "CON",
+       "PRN",
+       "AUX",
+       "NUL",
+       "COM1",
+       "COM2",
+       "COM3",
+       "COM4",
+       "COM5",
+       "COM6",
+       "COM7",
+       "COM8",
+       "COM9",
+       "LPT1",
+       "LPT2",
+       "LPT3",
+       "LPT4",
+       "LPT5",
+       "LPT6",
+       "LPT7",
+       "LPT8",
+       "LPT9",
+}
+
  // SplitPathVersion returns prefix and major version such that prefix+pathMajor == path
  // and version is either empty or "/vN" for N >= 2.
  // As a special case, gopkg.in paths are recognized directly;
@@ -228,3 +311,137 @@ func Sort(list []Version) {
                 return fi < fj
         })
  }
+
+// Safe encodings
+//
+// Module paths appear as substrings of file system paths
+// (in the download cache) and of web server URLs in the proxy protocol.
+// In general we cannot rely on file systems to be case-sensitive,
+// nor can we rely on web servers, since they read from file systems.
+// That is, we cannot rely on the file system to keep rsc.io/QUOTE
+// and rsc.io/quote separate. Windows and macOS don't.
+// Instead, we must never require two different casings of a file path.
+// Because we want the download cache to match the proxy protocol,
+// and because we want the proxy protocol to be possible to serve
+// from a tree of static files (which might be stored on a case-insensitive
+// file system), the proxy protocol must never require two different casings
+// of a URL path either.
+//
+// One possibility would be to make the safe encoding be the lowercase
+// hexadecimal encoding of the actual path bytes. This would avoid ever
+// needing different casings of a file path, but it would be fairly illegible
+// to most programmers when those paths appeared in the file system
+// (including in file paths in compiler errors and stack traces)
+// in web server logs, and so on. Instead, we want a safe encoding that
+// leaves most paths unaltered.
+//
+// The safe encoding is this:
+// replace every uppercase letter with an exclamation mark
+// followed by the letter's lowercase equivalent.
+//
+// For example,
+// github.com/Azure/azure-sdk-for-go ->  github.com/!azure/azure-sdk-for-go.
+// github.com/GoogleCloudPlatform/cloudsql-proxy -> github.com/!google!cloud!platform/cloudsql-proxy
+// github.com/Sirupsen/logrus -> github.com/!sirupsen/logrus.
+//
+// Import paths that avoid upper-case letters are left unchanged.
+// Note that because import paths are ASCII-only and avoid various
+// problematic punctuation (like : < and >), the safe encoding is also ASCII-only
+// and avoids the same problematic punctuation.
+//
+// Import paths have never allowed exclamation marks, so there is no
+// need to define how to encode a literal !.
+//
+// Although paths are disallowed from using Unicode (see pathOK above),
+// the eventual plan is to allow Unicode letters as well, to assume that
+// file systems and URLs are Unicode-safe (storing UTF-8), and apply
+// the !-for-uppercase convention. Note however that not all runes that
+// are different but case-fold equivalent are an upper/lower pair.
+// For example, U+004B ('K'), U+006B ('k'), and U+212A ('K' for Kelvin)
+// are considered to case-fold to each other. When we do add Unicode
+// letters, we must not assume that upper/lower are the only case-equivalent pairs.
+// Perhaps the Kelvin symbol would be disallowed entirely, for example.
+// Or perhaps it would encode as "!!k", or perhaps as "(212A)".
+//
+// Also, it would be nice to allow Unicode marks as well as letters,
+// but marks include combining marks, and then we must deal not
+// only with case folding but also normalization: both U+00E9 ('é')
+// and U+0065 U+0301 ('e' followed by combining acute accent)
+// look the same on the page and are treated by some file systems
+// as the same path. If we do allow Unicode marks in paths, there
+// must be some kind of normalization to allow only one canonical
+// encoding of any character used in an import path.
+
+// EncodePath returns the safe encoding of the given module path.
+// It fails if the module path is invalid.
+func EncodePath(path string) (encoding string, err error) {
+       if err := CheckPath(path); err != nil {
+               return "", err
+       }
+
+       haveUpper := false
+       for _, r := range path {
+               if r == '!' || r >= utf8.RuneSelf {
+                       // This should be disallowed by CheckPath, but diagnose anyway.
+                       // The correctness of the encoding loop below depends on it.
+                       return "", fmt.Errorf("internal error: inconsistency in EncodePath")
+               }
+               if 'A' <= r && r <= 'Z' {
+                       haveUpper = true
+               }
+       }
+
+       if !haveUpper {
+               return path, nil
+       }
+
+       var buf []byte
+       for _, r := range path {
+               if 'A' <= r && r <= 'Z' {
+                       buf = append(buf, '!', byte(r+'a'-'A'))
+               } else {
+                       buf = append(buf, byte(r))
+               }
+       }
+       return string(buf), nil
+}
+
+// DecodePath returns the module path of the given safe encoding.
+// It fails if the encoding is invalid.
+func DecodePath(encoding string) (path string, err error) {
+       var buf []byte
+
+       bang := false
+       for _, r := range encoding {
+               if r >= utf8.RuneSelf {
+                       goto BadEncoding
+               }
+               if bang {
+                       bang = false
+                       if r < 'a' || 'z' < r {
+                               goto BadEncoding
+                       }
+                       buf = append(buf, byte(r+'A'-'a'))
+                       continue
+               }
+               if r == '!' {
+                       bang = true
+                       continue
+               }
+               if 'A' <= r && r <= 'Z' {
+                       goto BadEncoding
+               }
+               buf = append(buf, byte(r))
+       }
+       if bang {
+               goto BadEncoding
+       }
+       path = string(buf)
+       if err := CheckPath(path); err != nil {
+               return "", fmt.Errorf("invalid module path encoding %q: %v", encoding, err)
+       }
+       return path, nil
+
+BadEncoding:
+       return "", fmt.Errorf("invalid module path encoding %q", encoding)
+}
diff --git a/src/cmd/go/internal/module/module_test.go b/src/cmd/go/internal/module/module_test.go

index 6142a9e048f47b1bf38aa29d204ac3dc1046eae7..972835f1bc251d719090bb752be45a2c21fbbf73 100644 (file)
--- a/src/cmd/go/internal/module/module_test.go
+++ b/src/cmd/go/internal/module/module_test.go
@@ -55,92 +55,111 @@ func TestCheck(t *testing.T) {
  }
  
  var checkPathTests = []struct {
-       path string
-       ok   bool
+       path     string
+       ok       bool
+       importOK bool
  }{
-       {"x.y/z", true},
-       {"x.y", true},
-
-       {"", false},
-       {"x.y/\xFFz", false},
-       {"/x.y/z", false},
-       {"x./z", false},
-       {".x/z", false},
-       {"-x/z", false},
-       {"x..y/z", false},
-       {"x.y/z/../../w", false},
-       {"x.y//z", false},
-       {"x.y/z//w", false},
-       {"x.y/z/", false},
-
-       {"x.y/z/v0", false},
-       {"x.y/z/v1", false},
-       {"x.y/z/v2", true},
-       {"x.y/z/v2.0", false},
-
-       {"!x.y/z", false},
-       {"_x.y/z", false},
-       {"x.y!/z", false},
-       {"x.y\"/z", false},
-       {"x.y#/z", false},
-       {"x.y$/z", false},
-       {"x.y%/z", false},
-       {"x.y&/z", false},
-       {"x.y'/z", false},
-       {"x.y(/z", false},
-       {"x.y)/z", false},
-       {"x.y*/z", false},
-       {"x.y+/z", false},
-       {"x.y,/z", false},
-       {"x.y-/z", true},
-       {"x.y./zt", false},
-       {"x.y:/z", false},
-       {"x.y;/z", false},
-       {"x.y</z", false},
-       {"x.y=/z", false},
-       {"x.y>/z", false},
-       {"x.y?/z", false},
-       {"x.y@/z", false},
-       {"x.y[/z", false},
-       {"x.y\\/z", false},
-       {"x.y]/z", false},
-       {"x.y^/z", false},
-       {"x.y_/z", false},
-       {"x.y`/z", false},
-       {"x.y{/z", false},
-       {"x.y}/z", false},
-       {"x.y~/z", false},
-       {"x.y/z!", false},
-       {"x.y/z\"", false},
-       {"x.y/z#", false},
-       {"x.y/z$", false},
-       {"x.y/z%", false},
-       {"x.y/z&", false},
-       {"x.y/z'", false},
-       {"x.y/z(", false},
-       {"x.y/z)", false},
-       {"x.y/z*", false},
-       {"x.y/z+", true},
-       {"x.y/z,", true},
-       {"x.y/z-", true},
-       {"x.y/z.t", true},
-       {"x.y/z/t", true},
-       {"x.y/z:", false},
-       {"x.y/z;", false},
-       {"x.y/z<", false},
-       {"x.y/z=", false},
-       {"x.y/z>", false},
-       {"x.y/z?", false},
-       {"x.y/z@", false},
-       {"x.y/z[", false},
-       {"x.y/z\\", false},
-       {"x.y/z]", false},
-       {"x.y/z^", false},
-       {"x.y/z_", true},
-       {"x.y/z`", false},
-       {"x.y/z{", false},
-       {"x.y/z}", false},
-       {"x.y/z~", true},
+       {"x.y/z", true, true},
+       {"x.y", true, true},
+
+       {"", false, false},
+       {"x.y/\xFFz", false, false},
+       {"/x.y/z", false, false},
+       {"x./z", false, false},
+       {".x/z", false, false},
+       {"-x/z", false, true},
+       {"x..y/z", false, false},
+       {"x.y/z/../../w", false, false},
+       {"x.y//z", false, false},
+       {"x.y/z//w", false, false},
+       {"x.y/z/", false, false},
+
+       {"x.y/z/v0", false, true},
+       {"x.y/z/v1", false, true},
+       {"x.y/z/v2", true, true},
+       {"x.y/z/v2.0", false, true},
+       {"X.y/z", false, true},
+
+       {"!x.y/z", false, false},
+       {"_x.y/z", false, true},
+       {"x.y!/z", false, false},
+       {"x.y\"/z", false, false},
+       {"x.y#/z", false, false},
+       {"x.y$/z", false, false},
+       {"x.y%/z", false, false},
+       {"x.y&/z", false, false},
+       {"x.y'/z", false, false},
+       {"x.y(/z", false, false},
+       {"x.y)/z", false, false},
+       {"x.y*/z", false, false},
+       {"x.y+/z", false, true},
+       {"x.y,/z", false, false},
+       {"x.y-/z", true, true},
+       {"x.y./zt", false, false},
+       {"x.y:/z", false, false},
+       {"x.y;/z", false, false},
+       {"x.y</z", false, false},
+       {"x.y=/z", false, false},
+       {"x.y>/z", false, false},
+       {"x.y?/z", false, false},
+       {"x.y@/z", false, false},
+       {"x.y[/z", false, false},
+       {"x.y\\/z", false, false},
+       {"x.y]/z", false, false},
+       {"x.y^/z", false, false},
+       {"x.y_/z", false, true},
+       {"x.y`/z", false, false},
+       {"x.y{/z", false, false},
+       {"x.y}/z", false, false},
+       {"x.y~/z", false, true},
+       {"x.y/z!", false, false},
+       {"x.y/z\"", false, false},
+       {"x.y/z#", false, false},
+       {"x.y/z$", false, false},
+       {"x.y/z%", false, false},
+       {"x.y/z&", false, false},
+       {"x.y/z'", false, false},
+       {"x.y/z(", false, false},
+       {"x.y/z)", false, false},
+       {"x.y/z*", false, false},
+       {"x.y/z+", true, true},
+       {"x.y/z,", false, false},
+       {"x.y/z-", true, true},
+       {"x.y/z.t", true, true},
+       {"x.y/z/t", true, true},
+       {"x.y/z:", false, false},
+       {"x.y/z;", false, false},
+       {"x.y/z<", false, false},
+       {"x.y/z=", false, false},
+       {"x.y/z>", false, false},
+       {"x.y/z?", false, false},
+       {"x.y/z@", false, false},
+       {"x.y/z[", false, false},
+       {"x.y/z\\", false, false},
+       {"x.y/z]", false, false},
+       {"x.y/z^", false, false},
+       {"x.y/z_", true, true},
+       {"x.y/z`", false, false},
+       {"x.y/z{", false, false},
+       {"x.y/z}", false, false},
+       {"x.y/z~", true, true},
+       {"x.y/x.foo", true, true},
+       {"x.y/aux.foo", false, false},
+       {"x.y/prn", false, false},
+       {"x.y/prn2", true, true},
+       {"x.y/com", true, true},
+       {"x.y/com1", false, false},
+       {"x.y/com1.txt", false, false},
+       {"x.y/calm1", true, true},
+       {"github.com/!123/logrus", false, false},
+
+       // TODO: CL 41822 allowed Unicode letters in old "go get"
+       // without due consideration of the implications, and only on github.com (!).
+       // For now, we disallow non-ASCII characters in module mode,
+       // in both module paths and general import paths,
+       // until we can get the implications right.
+       // When we do, we'll enable them everywhere, not just for GitHub.
+       {"github.com/user/unicode/испытание", false, false},
  }
  
  func TestCheckPath(t *testing.T) {
@@ -151,6 +170,13 @@ func TestCheckPath(t *testing.T) {
                 } else if !tt.ok && err == nil {
                         t.Errorf("CheckPath(%q) succeeded, wanted error", tt.path)
                 }
+
+               err = CheckImportPath(tt.path)
+               if tt.importOK && err != nil {
+                       t.Errorf("CheckImportPath(%q) = %v, wanted nil error", tt.path, err)
+               } else if !tt.importOK && err == nil {
+                       t.Errorf("CheckImportPath(%q) succeeded, wanted error", tt.path)
+               }
         }
  }
  
@@ -182,3 +208,84 @@ func TestSplitPathVersion(t *testing.T) {
                 }
         }
  }
+
+var encodeTests = []struct {
+       path string
+       enc  string // empty means same as path
+}{
+       {path: "ascii.com/abcdefghijklmnopqrstuvwxyz.-+/~_0123456789"},
+       {path: "github.com/GoogleCloudPlatform/omega", enc: "github.com/!google!cloud!platform/omega"},
+}
+
+func TestEncodePath(t *testing.T) {
+       // Check invalid paths.
+       for _, tt := range checkPathTests {
+               if !tt.ok {
+                       _, err := EncodePath(tt.path)
+                       if err == nil {
+                               t.Errorf("EncodePath(%q): succeeded, want error (invalid path)", tt.path)
+                       }
+               }
+       }
+
+       // Check encodings.
+       for _, tt := range encodeTests {
+               enc, err := EncodePath(tt.path)
+               if err != nil {
+                       t.Errorf("EncodePath(%q): unexpected error: %v", tt.path, err)
+                       continue
+               }
+               want := tt.enc
+               if want == "" {
+                       want = tt.path
+               }
+               if enc != want {
+                       t.Errorf("EncodePath(%q) = %q, want %q", tt.path, enc, want)
+               }
+       }
+}
+
+var badDecode = []string{
+       "github.com/GoogleCloudPlatform/omega",
+       "github.com/!google!cloud!platform!/omega",
+       "github.com/!0google!cloud!platform/omega",
+       "github.com/!_google!cloud!platform/omega",
+       "github.com/!!google!cloud!platform/omega",
+       "",
+}
+
+func TestDecodePath(t *testing.T) {
+       // Check invalid decodings.
+       for _, bad := range badDecode {
+               _, err := DecodePath(bad)
+               if err == nil {
+                       t.Errorf("DecodePath(%q): succeeded, want error (invalid decoding)", bad)
+               }
+       }
+
+       // Check invalid paths (or maybe decodings).
+       for _, tt := range checkPathTests {
+               if !tt.ok {
+                       path, err := DecodePath(tt.path)
+                       if err == nil {
+                               t.Errorf("DecodePath(%q) = %q, want error (invalid path)", tt.path, path)
+                       }
+               }
+       }
+
+       // Check encodings.
+       for _, tt := range encodeTests {
+               enc := tt.enc
+               if enc == "" {
+                       enc = tt.path
+               }
+               path, err := DecodePath(enc)
+               if err != nil {
+                       t.Errorf("DecodePath(%q): unexpected error: %v", enc, err)
+                       continue
+               }
+               if path != tt.path {
+                       t.Errorf("DecodePath(%q) = %q, want %q", enc, path, tt.path)
+               }
+       }
+}
author	Russ Cox <rsc@golang.org>
	Mon, 16 Jul 2018 02:56:37 +0000 (22:56 -0400)
committer	Russ Cox <rsc@golang.org>
	Wed, 18 Jul 2018 02:08:57 +0000 (02:08 +0000)
src/cmd/go/internal/module/module.go		patch \| blob \| history
src/cmd/go/internal/module/module_test.go		patch \| blob \| history