From: Sergey Matveev Date: Thu, 27 Jun 2024 16:44:18 +0000 (+0300) Subject: Deterministic pax archiver X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=2741cf4c8955e3f2f232ae85f47fe4ddc5a2a57c90f9c47442af707f0c546ae9;p=bass.git Deterministic pax archiver --- diff --git a/build/contrib/detpax/.gitignore b/build/contrib/detpax/.gitignore new file mode 100644 index 0000000..f65c310 --- /dev/null +++ b/build/contrib/detpax/.gitignore @@ -0,0 +1 @@ +/detpax diff --git a/build/contrib/detpax/README b/build/contrib/detpax/README new file mode 100644 index 0000000..00e3680 --- /dev/null +++ b/build/contrib/detpax/README @@ -0,0 +1,21 @@ +detpax -- deterministic pax-format archiver + +Reproducible package building expects packages to be bit-for-bit +identical on each rebuild. An ordinary tar invocation will +store file mtimes and UIDs/GIDs, which ruins reproducible builds. Even +the filesystem may list files in directories in a different order. + +https://reproducible-builds.org/docs/archives/ shows various +possibilities of reproducible ustar archive creation. Unfortunately +not everything (like big files) can be saved in ustar format. Forced +pax format creation leads to inclusion of precise timestamps in bsdtar, +and there are no options to skip that behaviour. + +detpax stores only a minimal set of metainformation. Directory walking +is done in sorted order. No UIDs/GIDs or any kind of timestamps are +stored, only permissions with sticky/setuid/setgid bits. + +It also has the ability to give some directories higher precedence in +the ordering. For example, BASS packages store their dependencies as +hook-scripts in the skelpkg subdirectory. With "-prec skelpkg" it will +be saved in the archive first, so that they can be found quickly. 
diff --git a/build/contrib/detpax/basic.t b/build/contrib/detpax/basic.t new file mode 100755 index 0000000..337e465 --- /dev/null +++ b/build/contrib/detpax/basic.t @@ -0,0 +1,106 @@ +#!/bin/sh + +testname=`basename "$0"` +test_description="Basic tests" +. $SHARNESS_TEST_SRCDIR/sharness.sh + +mkdir prefix +cd prefix + +list() { + tar tvf - | perl -ane ' + print "$F[0] "; + my $i=1; + for (; $i < $#F; $i++) { + last if (substr($F[$i], 0, 6) eq "prefix"); + }; + print join " ", @F[$i..$#F]; + print "\n"; + ' +} + +mkdir dir +echo hello >dir/hw +mkdir dir/subdir +ln dir/hw dir/subdir/hw-linked +ln -s unexistent dir/symlink + +mkdir prec0 +touch prec0/file +ln dir/hw prec0/hw-linked + +mkdir prec0/prec1 +touch prec0/prec1/file +ln dir/hw prec0/hw-linked + +cd .. +echo world >just-a-file +chmod -R go-rwx prefix just-a-file +chmod g+rx prefix +chmod +t prefix/dir +chmod o+w prefix/prec0 +chmod o+r prefix/dir/hw + +detpax prefix/ just-a-file | list >their +cat >our < unexistent +-rw------- prefix/prec0/file +-rw------- prefix/prec0/prec1/file +hrw----r-- prefix/dir/subdir/hw-linked link to prefix/dir/hw +hrw----r-- prefix/prec0/hw-linked link to prefix/dir/hw +-rw------- just-a-file +EOF +test_expect_success "basic" "test_cmp our their" + +detpax -prec prec0 prefix/ just-a-file | list >their +cat >our < unexistent +hrw----r-- prefix/dir/subdir/hw-linked link to prefix/dir/hw +-rw------- just-a-file +EOF +test_expect_success "prec0" "test_cmp our their" + +detpax -prec prec0/prec1 -prec prec0 prefix/ just-a-file | + tee arch.tar | list >their +cat >our < unexistent +hrw----r-- prefix/dir/subdir/hw-linked link to prefix/dir/hw +-rw------- just-a-file +EOF +test_expect_success "prec1" "test_cmp our their" + +mkdir tmp +cd tmp +test_expect_success "unpack" "tar xf ../arch.tar" + +echo hello >our +test_expect_success "hw.txt" "test_cmp our prefix/dir/hw" + +echo world >our +test_expect_success "just-a-file" "test_cmp our just-a-file" + +test_done diff --git 
a/build/contrib/detpax/build b/build/contrib/detpax/build new file mode 100755 index 0000000..342cc03 --- /dev/null +++ b/build/contrib/detpax/build @@ -0,0 +1,3 @@ +#!/bin/sh + +exec go build -ldflags=-s -o detpax diff --git a/build/contrib/detpax/go.mod b/build/contrib/detpax/go.mod new file mode 100644 index 0000000..872f6cd --- /dev/null +++ b/build/contrib/detpax/go.mod @@ -0,0 +1,3 @@ +module go.cypherpunks.ru/bass/detpax + +go 1.22.4 diff --git a/build/contrib/detpax/main.go b/build/contrib/detpax/main.go new file mode 100644 index 0000000..03ff1b5 --- /dev/null +++ b/build/contrib/detpax/main.go @@ -0,0 +1,228 @@ +// Copyright (C) 2024 Sergey Matveev +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 3 of the License. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. 
const (
	// SkelpkgPrefix is the name of the skelpkg subdirectory. Nothing in
	// this file refers to it; the directory is selected with -prec.
	SkelpkgPrefix = "skelpkg"
	// DefBufSize is the size of both the stdout write buffer and the
	// buffer used for copying file contents into the archive.
	DefBufSize = 1 << 20
	// OrderForLinks is the sort order of hard links found outside of
	// precedence directories: they go to the very end of the archive.
	OrderForLinks = 1<<31 - 1
	// OrderForOther is the sort order of symlinks and of regular files
	// found outside of precedence directories.
	OrderForOther = OrderForLinks - 1
)

// Info describes a single archive entry.
type Info struct {
	name  string      // path of the entry, also used as its archive name
	link  string      // symlink target, or the name of the hard link target
	size  int64       // content size; stays zero for anything but file data
	mode  fs.FileMode // file mode as reported by the filesystem
	order int         // primary sort key; directories keep the zero value
}

// Inode identifies a file on disk, it is used for hard link detection.
type Inode struct{ dev, ino uint64 }

// readDir returns the mode of the directory dir and its entries sorted
// by name. The directory is closed before returning, so that recursion
// in walk does not keep one descriptor open per nesting level.
// Any error terminates the program.
func readDir(dir string) (fs.FileMode, []fs.FileInfo) {
	fd, err := os.Open(dir)
	if err != nil {
		log.Fatalln("can not open:", dir, err)
	}
	defer fd.Close()
	fi, err := fd.Stat()
	if err != nil {
		log.Fatalln("can not stat:", dir, err)
	}
	fis, err := fd.Readdir(0)
	if err != nil {
		log.Fatalln("can not readdir:", dir, err)
	}
	sort.Slice(fis, func(i, j int) bool {
		return fis[i].Name() < fis[j].Name()
	})
	return fi.Mode(), fis
}

// dirOrder returns the sort order for regular files located directly in
// dir. The first precedence directory that is dir itself or one of its
// parents wins and yields 2, 4, 6, ... (the odd numbers in between are
// reserved for hard link targets). The match is done on whole path
// components, so "root/skelpkg-extra" is not inside "root/skelpkg".
// OrderForLinks is returned when no precedence directory matches.
func dirOrder(dir string, precs []string) int {
	for i, prec := range precs {
		if dir == prec || strings.HasPrefix(dir, prec+"/") {
			return 2 + 2*i
		}
	}
	return OrderForLinks
}

// regularInfo builds the entry for the regular file name. The first
// name seen for an inode carries the data, every further name becomes
// a hard link to it.
func regularInfo(
	name string,
	fi fs.FileInfo,
	defOrder int,
	inodes map[Inode]*Info,
) *Info {
	st, ok := fi.Sys().(*syscall.Stat_t)
	if !ok {
		log.Fatalln("can not get inode of:", name)
	}
	// Explicit conversions: the field types differ between platforms.
	inode := Inode{uint64(st.Dev), uint64(st.Ino)}
	info := &Info{name: name, mode: fi.Mode(), order: defOrder}
	target := inodes[inode]
	if target == nil {
		inodes[inode] = info
		info.size = fi.Size()
		if info.order == OrderForLinks {
			info.order = OrderForOther
		}
		return info
	}
	info.link = target.name
	// The target has to precede each of its links in the archive, so it
	// is only ever moved earlier. A later link with lower precedence
	// must not push it behind links that are already placed.
	if before := info.order - 1; before < target.order {
		target.order = before
	}
	return info
}

// walk recursively collects root and everything below it into files,
// keyed by path. precs holds the full paths of precedence directories,
// highest precedence first. inodes remembers the first entry seen for
// each inode, for hard link detection. Directories, symlinks and
// regular files are recorded; any other kind of file is skipped.
// Any error terminates the program.
func walk(
	root string,
	precs []string,
	files map[string]*Info,
	inodes map[Inode]*Info,
) {
	dirMode, fis := readDir(root)
	files[root] = &Info{name: root, mode: dirMode}
	defOrder := dirOrder(root, precs)
	for _, fi := range fis {
		name := root + "/" + fi.Name()
		mode := fi.Mode()
		switch {
		case fi.IsDir():
			walk(name, precs, files, inodes)
		case mode&fs.ModeSymlink != 0:
			target, err := os.Readlink(name)
			if err != nil {
				log.Fatalln("can not Readlink:", name, err)
			}
			files[name] = &Info{
				name:  name,
				mode:  mode,
				link:  target,
				order: OrderForOther,
			}
		case mode.IsRegular():
			files[name] = regularInfo(name, fi, defOrder, inodes)
		}
	}
}

// collect returns the archive entries for the command line argument
// root, sorted by order and then by name. root must be either a
// directory or a regular file. precsOrig are the -prec values, they
// are relative to root.
func collect(root string, precsOrig []string) []*Info {
	fi, err := os.Lstat(root)
	if err != nil {
		log.Fatalln("can not stat:", root, err)
	}
	filesMap := map[string]*Info{}
	switch {
	case fi.IsDir():
		precs := make([]string, 0, len(precsOrig))
		for _, prec := range precsOrig {
			// A trailing slash would defeat matching in dirOrder.
			precs = append(precs, root+"/"+strings.TrimSuffix(prec, "/"))
		}
		walk(root, precs, filesMap, map[Inode]*Info{})
	case fi.Mode().IsRegular():
		filesMap[root] = &Info{
			name: root,
			mode: fi.Mode(),
			size: fi.Size(),
		}
	default:
		log.Fatalln("only directory or file expected:", root)
	}
	files := make([]*Info, 0, len(filesMap))
	for _, info := range filesMap {
		files = append(files, info)
	}
	sort.Slice(files, func(i, j int) bool {
		if files[i].order == files[j].order {
			return files[i].name < files[j].name
		}
		return files[i].order < files[j].order
	})
	return files
}

// writeEntry writes the header of info to w, followed by the file
// contents if the entry is a regular file. Only name, permission bits
// (with sticky/setuid/setgid), size and link name are stored. buf is
// the scratch buffer used for copying.
func writeEntry(w *tar.Writer, info *Info, buf []byte) {
	hdr := tar.Header{
		Format:   tar.FormatPAX,
		Name:     info.name,
		Mode:     int64(info.mode.Perm()),
		Size:     info.size,
		Linkname: info.link,
	}
	if info.mode&fs.ModeSticky != 0 {
		hdr.Mode |= 01000
	}
	if info.mode&fs.ModeSetgid != 0 {
		hdr.Mode |= 02000
	}
	if info.mode&fs.ModeSetuid != 0 {
		hdr.Mode |= 04000
	}
	switch {
	case info.mode&fs.ModeDir != 0:
		hdr.Typeflag = tar.TypeDir
	case info.mode&fs.ModeSymlink != 0:
		hdr.Typeflag = tar.TypeSymlink
	case info.link != "":
		hdr.Typeflag = tar.TypeLink
	default:
		hdr.Typeflag = tar.TypeReg
	}
	if err := w.WriteHeader(&hdr); err != nil {
		log.Fatalln("can not WriteHeader:", err)
	}
	if hdr.Typeflag != tar.TypeReg {
		return
	}
	fd, err := os.Open(info.name)
	if err != nil {
		log.Fatalln("can not open:", err)
	}
	defer fd.Close()
	if _, err = io.CopyBuffer(w, fd, buf); err != nil {
		log.Fatalln("can not copy:", info.name, err)
	}
}

// main archives every FILE or DIR given on the command line to stdout
// as a single pax archive.
func main() {
	var precsOrig []string
	flag.Func("prec", "Add directory with higher precedence",
		func(s string) error {
			if s != "" {
				precsOrig = append(precsOrig, s)
			}
			return nil
		})
	flag.Usage = func() {
		fmt.Fprintln(flag.CommandLine.Output(),
			`Usage: detpax [-prec PREC0] [-prec PRECx] {FILE|DIR} [...]

Precedence directories must not include DIR name.
PREC0 has higher precedence than following ones.
    -prec foo/bar -prec foo dir
will put dir/foo/bar first, dir/foo second, others next.`)
		flag.PrintDefaults()
	}
	flag.Parse()
	buf := make([]byte, DefBufSize)
	bw := bufio.NewWriterSize(os.Stdout, DefBufSize)
	w := tar.NewWriter(bw)
	for _, root := range flag.Args() {
		root = strings.TrimSuffix(root, "/")
		for _, info := range collect(root, precsOrig) {
			writeEntry(w, info, buf)
		}
	}
	if err := w.Close(); err != nil {
		log.Fatalln(err)
	}
	if err := bw.Flush(); err != nil {
		log.Fatalln(err)
	}
}
chmod -R a-w $namenhash -find $namenhash -exec touch -h -d "$BASS_BIRTHTIME" {} + -find $namenhash -print0 | LC_ALL=C sort -z >$tmp/tar-list -{ - perl -0 -F/ -lane 'print if $F[1] eq "skelpkg"' $tmp/tar-list - perl -0 -F/ -lane 'print unless $F[1] eq "skelpkg"' $tmp/tar-list -} >$tmp/tar-list-sorted -mv $tmp/tar-list-sorted $tmp/tar-list { - $TAR cfTn - $tmp/tar-list --null --uid=0 --gid=0 --numeric-owner || + "$BASS_ROOT"/build/contrib/detpax/detpax -prec skelpkg $namenhash || touch $tmp/tar-pipe-failed } | { $COMPRESSOR || touch $tmp/tar-pipe-failed ; } | @@ -74,5 +63,4 @@ mv $namenhash bin mv $namenhash.meta4 bin.meta4 totar="name name.meta4 buildinfo buildinfo.meta4 bin.meta4 bin" chmod a-w $totar -touch -h -d "$BASS_BIRTHTIME" $totar -$TAR cf - --uid=0 --gid=0 --numeric-owner $totar +"$BASS_ROOT"/build/contrib/detpax/detpax $totar diff --git a/build/lib/zip-hash/go.mod b/build/lib/zip-hash/go.mod index 4efe152..6b6b9bc 100644 --- a/build/lib/zip-hash/go.mod +++ b/build/lib/zip-hash/go.mod @@ -1,4 +1,4 @@ -module rnd.stcnet.ru/zip-hash +module go.cypherpunks.ru/bass/zip-hash go 1.22.1 diff --git a/doc/build/skelpkg.texi b/doc/build/skelpkg.texi index 4ae9eed..1e51052 100644 --- a/doc/build/skelpkg.texi +++ b/doc/build/skelpkg.texi @@ -36,14 +36,13 @@ single file, uncompressed POSIX pax archive with following entries: @cindex pax archive @cindex ustar archive +@pindex detpax POSIX ustar archive format can not hold more than 8GiB of data and (very) long filenames. Forced pax usage guarantees compatibility with variety of OSes. GNU tar's format (also not having limitations above) easily -could be unreadable on non-GNU systems. Unfortunately forced pax causes -including of @command{atime}/@command{ctime} metainformation, that can -not be omitted from @command{bsdtar} CLI. They prevent byte-to-byte -deterministic archives creation. So we actually use ustar, that will be -upgraded to pax if necessary (too long filenames or sizes). 
+could be unreadable on non-GNU systems. BASS uses the +@command{build/contrib/detpax} archiver to create pax archives in a +deterministic, bit-for-bit reproducible way. As pax/tar does not have any kind of index, as ZIP does, it is crucial to place the largest @file{bin} file at the very end of the archive. And