From: Tim King <taking@google.com>
Date: Thu, 14 Nov 2024 20:04:39 +0000 (-0800)
Subject: internal/exportdata, cmd/compile/internal/noder: merge export data handling
X-Git-Tag: go1.24rc1~113
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=0edea47f264a4185d78e00e1e9e977d99f5c997b;p=gostls13.git

internal/exportdata, cmd/compile/internal/noder: merge export data handling

Unify how go/types, types2, and noder read in unified export data from
GC-created files.

This splits FindExportData into smaller pieces for improved code
sharing.
- FindPackageDefinition finds the package definition file in the ar
  archive.
- ReadObjectHeaders reads the object headers.
- ReadExportDataHeader reads the export data format header.

There is a new convenience wrapper ReadUnified that combines all of
these. This documents the expected archive contents.

Updates noder and the importers to use these.
This also adjusts when end-of-section marker ("\n$$\n") checking happens.

Change-Id: Iec2179b0a1ae7f69eb12d077018f731116a77f13
Reviewed-on: https://go-review.googlesource.com/c/go/+/628155
Reviewed-by: Robert Griesemer <gri@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Commit-Queue: Tim King <taking@google.com>
---

diff --git a/src/cmd/compile/internal/importer/gcimporter.go b/src/cmd/compile/internal/importer/gcimporter.go
index 9af257730d..e0aec98231 100644
--- a/src/cmd/compile/internal/importer/gcimporter.go
+++ b/src/cmd/compile/internal/importer/gcimporter.go
@@ -11,10 +11,8 @@ import (
 	"fmt"
 	"internal/exportdata"
 	"internal/pkgbits"
-	"internal/saferio"
 	"io"
 	"os"
-	"strings"
 
 	"cmd/compile/internal/types2"
 )
@@ -75,50 +73,15 @@ func Import(packages map[string]*types2.Package, path, srcDir string, lookup fun
 	defer rc.Close()
 
 	buf := bufio.NewReader(rc)
-	hdr, size, err := exportdata.FindExportData(buf)
+	data, err := exportdata.ReadUnified(buf)
 	if err != nil {
+		err = fmt.Errorf("import %q: %v", path, err)
 		return
 	}
+	s := string(data)
 
-	switch hdr {
-	case "$$\n":
-		err = fmt.Errorf("import %q: old textual export format no longer supported (recompile package)", path)
-
-	case "$$B\n":
-		var exportFormat byte
-		if exportFormat, err = buf.ReadByte(); err != nil {
-			return
-		}
-		size--
-
-		// The unified export format starts with a 'u'; the indexed export
-		// format starts with an 'i'; and the older binary export format
-		// starts with a 'c', 'd', or 'v' (from "version"). Select
-		// appropriate importer.
-		switch exportFormat {
-		case 'u':
-			// exported strings may contain "\n$$\n" - search backwards
-			var data []byte
-			var r io.Reader = buf
-			if size >= 0 {
-				if data, err = saferio.ReadData(r, uint64(size)); err != nil {
-					return
-				}
-			} else if data, err = io.ReadAll(r); err != nil {
-				return
-			}
-			s := string(data)
-			s = s[:strings.LastIndex(s, "\n$$\n")]
-
-			input := pkgbits.NewPkgDecoder(id, s)
-			pkg = ReadPackage(nil, packages, input)
-		default:
-			err = fmt.Errorf("import %q: binary export format %q is no longer supported (recompile package)", path, exportFormat)
-		}
-
-	default:
-		err = fmt.Errorf("import %q: unknown export data header: %q", path, hdr)
-	}
+	input := pkgbits.NewPkgDecoder(id, s)
+	pkg = ReadPackage(nil, packages, input)
 
 	return
 }
diff --git a/src/cmd/compile/internal/noder/import.go b/src/cmd/compile/internal/noder/import.go
index 964b01ec42..910988f061 100644
--- a/src/cmd/compile/internal/noder/import.go
+++ b/src/cmd/compile/internal/noder/import.go
@@ -8,6 +8,7 @@ import (
 	"errors"
 	"fmt"
 	"internal/buildcfg"
+	"internal/exportdata"
 	"internal/pkgbits"
 	"os"
 	pathpkg "path"
@@ -22,7 +23,6 @@ import (
 	"cmd/compile/internal/typecheck"
 	"cmd/compile/internal/types"
 	"cmd/compile/internal/types2"
-	"cmd/internal/archive"
 	"cmd/internal/bio"
 	"cmd/internal/goobj"
 	"cmd/internal/objabi"
@@ -207,7 +207,7 @@ func readImportFile(path string, target *ir.Package, env *types2.Context, packag
 	}
 	defer f.Close()
 
-	r, end, err := findExportData(f)
+	data, err := readExportData(f)
 	if err != nil {
 		return
 	}
@@ -216,94 +216,63 @@ func readImportFile(path string, target *ir.Package, env *types2.Context, packag
 		fmt.Printf("importing %s (%s)\n", path, f.Name())
 	}
 
-	c, err := r.ReadByte()
-	if err != nil {
-		return
-	}
+	pr := pkgbits.NewPkgDecoder(pkg1.Path, data)
 
-	pos := r.Offset()
-
-	// Map export data section into memory as a single large
-	// string. This reduces heap fragmentation and allows returning
-	// individual substrings very efficiently.
-	var data string
-	data, err = base.MapFile(r.File(), pos, end-pos)
-	if err != nil {
-		return
-	}
-
-	switch c {
-	case 'u':
-		// TODO(mdempsky): This seems a bit clunky.
-		data = strings.TrimSuffix(data, "\n$$\n")
-
-		pr := pkgbits.NewPkgDecoder(pkg1.Path, data)
-
-		// Read package descriptors for both types2 and compiler backend.
-		readPackage(newPkgReader(pr), pkg1, false)
-		pkg2 = importer.ReadPackage(env, packages, pr)
-
-	default:
-		// Indexed format is distinguished by an 'i' byte,
-		// whereas previous export formats started with 'c', 'd', or 'v'.
-		err = fmt.Errorf("unexpected package format byte: %v", c)
-		return
-	}
+	// Read package descriptors for both types2 and compiler backend.
+	readPackage(newPkgReader(pr), pkg1, false)
+	pkg2 = importer.ReadPackage(env, packages, pr)
 
-	err = addFingerprint(path, f, end)
+	err = addFingerprint(path, data)
 	return
 }
 
-// findExportData returns a *bio.Reader positioned at the start of the
-// binary export data section, and a file offset for where to stop
-// reading.
-func findExportData(f *os.File) (r *bio.Reader, end int64, err error) {
-	r = bio.NewReader(f)
+// readExportData returns the contents of GC-created unified export data.
+func readExportData(f *os.File) (data string, err error) {
+	r := bio.NewReader(f)
 
-	// check object header
-	line, err := r.ReadString('\n')
+	sz, err := exportdata.FindPackageDefinition(r.Reader)
 	if err != nil {
 		return
 	}
+	end := r.Offset() + int64(sz)
 
-	// Is the first line an archive file signature?
-	if line != "!<arch>\n" {
-		err = fmt.Errorf("not the start of an archive file (%q)", line)
+	abihdr, _, err := exportdata.ReadObjectHeaders(r.Reader)
+	if err != nil {
 		return
 	}
 
-	// package export block should be first
-	sz := int64(archive.ReadHeader(r.Reader, "__.PKGDEF"))
-	if sz <= 0 {
-		err = errors.New("not a package file")
+	if expect := objabi.HeaderString(); abihdr != expect {
+		err = fmt.Errorf("object is [%s] expected [%s]", abihdr, expect)
 		return
 	}
-	end = r.Offset() + sz
-	line, err = r.ReadString('\n')
+
+	_, err = exportdata.ReadExportDataHeader(r.Reader)
 	if err != nil {
 		return
 	}
 
-	if !strings.HasPrefix(line, "go object ") {
-		err = fmt.Errorf("not a go object file: %s", line)
-		return
-	}
-	if expect := objabi.HeaderString(); line != expect {
-		err = fmt.Errorf("object is [%s] expected [%s]", line, expect)
+	pos := r.Offset()
+
+	// Map export data section (+ end-of-section marker) into memory
+	// as a single large string. This reduces heap fragmentation and
+	// allows returning individual substrings very efficiently.
+	var mapped string
+	mapped, err = base.MapFile(r.File(), pos, end-pos)
+	if err != nil {
 		return
 	}
 
-	// process header lines
-	for !strings.HasPrefix(line, "$$") {
-		line, err = r.ReadString('\n')
-		if err != nil {
-			return
-		}
-	}
+	// check for end-of-section marker "\n$$\n" and remove it
+	const marker = "\n$$\n"
 
-	// Expect $$B\n to signal binary import format.
-	if line != "$$B\n" {
-		err = errors.New("old export format no longer supported (recompile package)")
+	var ok bool
+	data, ok = strings.CutSuffix(mapped, marker)
+	if !ok {
+		cutoff := data // include last 10 bytes in error message
+		if len(cutoff) >= 10 {
+			cutoff = cutoff[len(cutoff)-10:]
+		}
+		err = fmt.Errorf("expected $$ marker, but found %q (recompile package)", cutoff)
 		return
 	}
 
@@ -312,24 +281,16 @@ func findExportData(f *os.File) (r *bio.Reader, end int64, err error) {
 
 // addFingerprint reads the linker fingerprint included at the end of
 // the exportdata.
-func addFingerprint(path string, f *os.File, end int64) error {
-	const eom = "\n$$\n"
+func addFingerprint(path string, data string) error {
 	var fingerprint goobj.FingerprintType
 
-	var buf [len(fingerprint) + len(eom)]byte
-	if _, err := f.ReadAt(buf[:], end-int64(len(buf))); err != nil {
-		return err
-	}
-
-	// Caller should have given us the end position of the export data,
-	// which should end with the "\n$$\n" marker. As a consistency check
-	// to make sure we're reading at the right offset, make sure we
-	// found the marker.
-	if s := string(buf[len(fingerprint):]); s != eom {
-		return fmt.Errorf("expected $$ marker, but found %q", s)
+	pos := len(data) - len(fingerprint)
+	if pos < 0 {
+		return fmt.Errorf("missing linker fingerprint in exportdata, but found %q", data)
 	}
+	buf := []byte(data[pos:])
 
-	copy(fingerprint[:], buf[:])
+	copy(fingerprint[:], buf)
 	base.Ctxt.AddImport(path, fingerprint)
 
 	return nil
diff --git a/src/go/internal/gcimporter/gcimporter.go b/src/go/internal/gcimporter/gcimporter.go
index 451afe6fd5..ed5e5dcacd 100644
--- a/src/go/internal/gcimporter/gcimporter.go
+++ b/src/go/internal/gcimporter/gcimporter.go
@@ -12,10 +12,8 @@ import (
 	"go/types"
 	"internal/exportdata"
 	"internal/pkgbits"
-	"internal/saferio"
 	"io"
 	"os"
-	"strings"
 )
 
 // Import imports a gc-generated package given its import path and srcDir, adds
@@ -72,49 +70,15 @@ func Import(fset *token.FileSet, packages map[string]*types.Package, path, srcDi
 	defer rc.Close()
 
 	buf := bufio.NewReader(rc)
-	hdr, size, err := exportdata.FindExportData(buf)
+	data, err := exportdata.ReadUnified(buf)
 	if err != nil {
+		err = fmt.Errorf("import %q: %v", path, err)
 		return
 	}
+	s := string(data)
 
-	switch hdr {
-	case "$$\n":
-		err = fmt.Errorf("import %q: old textual export format no longer supported (recompile package)", path)
-
-	case "$$B\n":
-		var exportFormat byte
-		if exportFormat, err = buf.ReadByte(); err != nil {
-			return
-		}
-		size--
-
-		// The unified export format starts with a 'u'; the indexed export
-		// format starts with an 'i'; and the older binary export format
-		// starts with a 'c', 'd', or 'v' (from "version"). Select
-		// appropriate importer.
-		switch exportFormat {
-		case 'u':
-			var data []byte
-			var r io.Reader = buf
-			if size >= 0 {
-				if data, err = saferio.ReadData(r, uint64(size)); err != nil {
-					return
-				}
-			} else if data, err = io.ReadAll(r); err != nil {
-				return
-			}
-			s := string(data)
-			s = s[:strings.LastIndex(s, "\n$$\n")]
-
-			input := pkgbits.NewPkgDecoder(id, s)
-			pkg = readUnifiedPackage(fset, nil, packages, input)
-		default:
-			err = fmt.Errorf("import %q: binary export format %q is no longer supported (recompile package)", path, exportFormat)
-		}
-
-	default:
-		err = fmt.Errorf("import %q: unknown export data header: %q", path, hdr)
-	}
+	input := pkgbits.NewPkgDecoder(id, s)
+	pkg = readUnifiedPackage(fset, nil, packages, input)
 
 	return
 }
diff --git a/src/internal/exportdata/exportdata.go b/src/internal/exportdata/exportdata.go
index 5cd7cb18c2..27675923b5 100644
--- a/src/internal/exportdata/exportdata.go
+++ b/src/internal/exportdata/exportdata.go
@@ -6,12 +6,16 @@
 // and reading gc-generated object files.
 package exportdata
 
+// This file should be kept in sync with src/cmd/compile/internal/gc/obj.go .
+
 import (
 	"bufio"
 	"bytes"
 	"errors"
 	"fmt"
 	"go/build"
+	"internal/saferio"
+	"io"
 	"os"
 	"os/exec"
 	"path/filepath"
@@ -19,13 +23,100 @@ import (
 	"sync"
 )
 
-// FindExportData positions the reader r at the beginning of the
-// export data section of an underlying GC-created object/archive
-// file by reading from it. The reader must be positioned at the
-// start of the file before calling this function. The hdr result
-// is the string before the export data, either "$$" or "$$B".
-func FindExportData(r *bufio.Reader) (hdr string, size int, err error) {
-	// TODO(taking): Merge with cmd/compile/internal/noder.findExportData.
+// ReadUnified reads the contents of the unified export data from a reader r
+// that contains the contents of a GC-created archive file.
+//
+// On success, the reader will be positioned after the end-of-section marker "\n$$\n".
+//
+// Supported GC-created archive files have 4 layers of nesting:
+//   - An archive file containing a package definition file.
+//   - The package definition file contains headers followed by a data section.
+//     Headers are lines (â¤ 4kb) that do not start with "$$".
+//   - The data section starts with "$$B\n" followed by export data followed
+//     by an end of section marker "\n$$\n". (The section start "$$\n" is no
+//     longer supported.)
+//   - The export data starts with a format byte ('u') followed by the <data> in
+//     the given format. (See ReadExportDataHeader for older formats.)
+//
+// Putting this together, the bytes in a GC-created archive files are expected
+// to look like the following.
+// See cmd/internal/archive for more details on ar file headers.
+//
+// | <!arch>\n             | ar file signature
+// | __.PKGDEF...size...\n | ar header for __.PKGDEF including size.
+// | go object <...>\n     | objabi header
+// | <optional headers>\n  | other headers such as build id
+// | $$B\n                 | binary format marker
+// | u<data>\n             | unified export <data>
+// | $$\n                  | end-of-section marker
+// | [optional padding]    | padding byte (0x0A) if size is odd
+// | [ar file header]      | other ar files
+// | [ar file data]        |
+func ReadUnified(r *bufio.Reader) (data []byte, err error) {
+	// We historically guaranteed headers at the default buffer size (4096) work.
+	// This ensures we can use ReadSlice throughout.
+	const minBufferSize = 4096
+	r = bufio.NewReaderSize(r, minBufferSize)
+
+	size, err := FindPackageDefinition(r)
+	if err != nil {
+		return
+	}
+	n := size
+
+	objapi, headers, err := ReadObjectHeaders(r)
+	if err != nil {
+		return
+	}
+	n -= len(objapi)
+	for _, h := range headers {
+		n -= len(h)
+	}
+
+	hdrlen, err := ReadExportDataHeader(r)
+	if err != nil {
+		return
+	}
+	n -= hdrlen
+
+	// size also includes the end of section marker. Remove that many bytes from the end.
+	const marker = "\n$$\n"
+	n -= len(marker)
+
+	if n < 0 {
+		err = fmt.Errorf("invalid size (%d) in the archive file: %d bytes remain without section headers (recompile package)", size, n)
+	}
+
+	// Read n bytes from buf.
+	data, err = saferio.ReadData(r, uint64(n))
+	if err != nil {
+		return
+	}
+
+	// Check for marker at the end.
+	var suffix [len(marker)]byte
+	_, err = io.ReadFull(r, suffix[:])
+	if err != nil {
+		return
+	}
+	if s := string(suffix[:]); s != marker {
+		err = fmt.Errorf("read %q instead of end-of-section marker (%q)", s, marker)
+		return
+	}
+
+	return
+}
+
+// FindPackageDefinition positions the reader r at the beginning of a package
+// definition file ("__.PKGDEF") within a GC-created archive by reading
+// from it, and returns the size of the package definition file in the archive.
+//
+// The reader must be positioned at the start of the archive file before calling
+// this function, and "__.PKGDEF" is assumed to be the first file in the archive.
+//
+// See cmd/internal/archive for details on the archive format.
+func FindPackageDefinition(r *bufio.Reader) (size int, err error) {
+	// Uses ReadSlice to limit risk of malformed inputs.
 
 	// Read first line to make sure this is an object file.
 	line, err := r.ReadSlice('\n')
@@ -47,31 +138,96 @@ func FindExportData(r *bufio.Reader) (hdr string, size int, err error) {
 		return
 	}
 
-	// Read first line of __.PKGDEF data, so that line
-	// is once again the first line of the input.
+	return
+}
+
+// ReadObjectHeaders reads object headers from the reader. Object headers are
+// lines that do not start with an end-of-section marker "$$". The first header
+// is the objabi header. On success, the reader will be positioned at the beginning
+// of the end-of-section marker.
+//
+// It returns an error if any header does not fit in r.Size() bytes.
+func ReadObjectHeaders(r *bufio.Reader) (objapi string, headers []string, err error) {
+	// line is a temporary buffer for headers.
+	// Use bounded reads (ReadSlice, Peek) to limit risk of malformed inputs.
+	var line []byte
+
+	// objapi header should be the first line
 	if line, err = r.ReadSlice('\n'); err != nil {
 		err = fmt.Errorf("can't find export data (%v)", err)
 		return
 	}
+	objapi = string(line)
+
+	// objapi header begins with "go object ".
+	if !strings.HasPrefix(objapi, "go object ") {
+		err = fmt.Errorf("not a go object file: %s", objapi)
+		return
+	}
+
+	// process remaining object header lines
+	for {
+		// check for an end of section marker "$$"
+		line, err = r.Peek(2)
+		if err != nil {
+			return
+		}
+		if string(line) == "$$" {
+			return // stop
+		}
+
+		// read next header
+		line, err = r.ReadSlice('\n')
+		if err != nil {
+			return
+		}
+		headers = append(headers, string(line))
+	}
+}
 
-	// Now at __.PKGDEF in archive. line should begin with "go object ".
-	if !strings.HasPrefix(string(line), "go object ") {
-		err = fmt.Errorf("not a Go object file")
+// ReadExportDataHeader reads the export data header and format from r.
+// It returns the number of bytes read, or an error if the format is no longer
+// supported or it failed to read.
+//
+// The only currently supported format is binary export data in the
+// unified export format.
+func ReadExportDataHeader(r *bufio.Reader) (n int, err error) {
+	// Read export data header.
+	line, err := r.ReadSlice('\n')
+	if err != nil {
 		return
 	}
-	size -= len(line)
 
-	// Skip over object header to export data.
-	// Begins after first line starting with $$.
-	for line[0] != '$' {
-		if line, err = r.ReadSlice('\n'); err != nil {
-			err = fmt.Errorf("can't find export data (%v)", err)
+	hdr := string(line)
+	switch hdr {
+	case "$$\n":
+		err = fmt.Errorf("old textual export format no longer supported (recompile package)")
+		return
+
+	case "$$B\n":
+		var format byte
+		format, err = r.ReadByte()
+		if err != nil {
+			return
+		}
+		// The unified export format starts with a 'u'.
+		switch format {
+		case 'u':
+		default:
+			// Older no longer supported export formats include:
+			// indexed export format which started with an 'i'; and
+			// the older binary export format which started with a 'c',
+			// 'd', or 'v' (from "version").
+			err = fmt.Errorf("binary export format %q is no longer supported (recompile package)", format)
 			return
 		}
-		size -= len(line)
+
+	default:
+		err = fmt.Errorf("unknown export data header: %q", hdr)
+		return
 	}
-	hdr = string(line)
 
+	n = len(hdr) + 1 // + 1 is for 'u'
 	return
 }