This reduces the size of the go tool binary on arm64 by 12k.
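
For example, two adjacent byte zero stores such as (illustrative
function, mirroring the new asm tests below)

    func zero2(b []byte) { // hypothetical name, for illustration only
        _ = b[1] // early bounds check to guarantee safety of writes below
        b[0] = 0
        b[1] = 0
    }

now compile to a single MOVH ZR halfword store instead of two MOVB
stores, and longer runs of zero stores combine into MOVW ZR, MOVD ZR,
and STP stores.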
go1 results on Amberwing:
name                   old time/op    new time/op    delta
RegexpMatchEasy0_32       249ns ± 0%     249ns ± 0%    ~     (p=0.087 n=10+10)
RegexpMatchEasy0_1K       584ns ± 0%     584ns ± 0%    ~     (all equal)
RegexpMatchEasy1_32       246ns ± 0%     246ns ± 0%    ~     (p=1.000 n=10+10)
RegexpMatchEasy1_1K       806ns ± 0%     806ns ± 0%    ~     (p=0.706 n=10+9)
RegexpMatchMedium_32      314ns ± 0%     314ns ± 0%    ~     (all equal)
RegexpMatchMedium_1K     52.1µs ± 0%    52.1µs ± 0%    ~     (p=0.245 n=10+8)
RegexpMatchHard_32       2.75µs ± 1%    2.75µs ± 1%    ~     (p=0.690 n=10+10)
RegexpMatchHard_1K       78.9µs ± 0%    78.9µs ± 1%    ~     (p=0.295 n=9+9)
FmtFprintfEmpty          58.5ns ± 0%    58.5ns ± 0%    ~     (all equal)
FmtFprintfString          112ns ± 0%     112ns ± 0%    ~     (all equal)
FmtFprintfInt             117ns ± 0%     116ns ± 0%  -0.85%  (p=0.000 n=10+10)
FmtFprintfIntInt          181ns ± 0%     181ns ± 0%    ~     (all equal)
FmtFprintfPrefixedInt     222ns ± 0%     224ns ± 0%  +0.90%  (p=0.000 n=9+10)
FmtFprintfFloat           318ns ± 1%     322ns ± 0%    ~     (p=0.059 n=10+8)
FmtManyArgs               736ns ± 1%     735ns ± 0%    ~     (p=0.206 n=9+9)
Gzip                      437ms ± 0%     436ms ± 0%  -0.25%  (p=0.000 n=10+10)
HTTPClientServer         89.8µs ± 1%    90.2µs ± 2%    ~     (p=0.393 n=10+10)
JSONEncode               20.1ms ± 1%    20.2ms ± 1%    ~     (p=0.065 n=9+10)
JSONDecode               94.2ms ± 1%    93.9ms ± 1%  -0.42%  (p=0.043 n=10+10)
GobDecode                12.7ms ± 1%    12.8ms ± 2%  +0.94%  (p=0.019 n=10+10)
GobEncode                12.1ms ± 0%    12.1ms ± 0%    ~     (p=0.052 n=10+10)
Mandelbrot200            5.06ms ± 0%    5.05ms ± 0%  -0.04%  (p=0.000 n=9+10)
TimeParse                 450ns ± 3%     446ns ± 0%    ~     (p=0.238 n=10+9)
TimeFormat                485ns ± 1%     483ns ± 1%    ~     (p=0.073 n=10+10)
Template                 90.4ms ± 0%    90.7ms ± 0%  +0.29%  (p=0.000 n=8+10)
GoParse                  6.01ms ± 0%    6.03ms ± 0%  +0.35%  (p=0.000 n=10+10)
BinaryTree17              11.7s ± 0%     11.7s ± 0%    ~     (p=0.481 n=10+10)
Revcomp                   669ms ± 0%     669ms ± 0%    ~     (p=0.315 n=10+10)
Fannkuch11                3.40s ± 0%     3.37s ± 0%  -0.92%  (p=0.000 n=10+10)
[Geo mean]                67.9µs         67.9µs       +0.02%

name                   old speed      new speed      delta
RegexpMatchEasy0_32      128MB/s ± 0%    128MB/s ± 0%  -0.08%  (p=0.003 n=8+10)
RegexpMatchEasy0_1K     1.75GB/s ± 0%   1.75GB/s ± 0%    ~     (p=0.642 n=8+10)
RegexpMatchEasy1_32      130MB/s ± 0%    130MB/s ± 0%    ~     (p=0.690 n=10+9)
RegexpMatchEasy1_1K     1.27GB/s ± 0%   1.27GB/s ± 0%    ~     (p=0.661 n=10+9)
RegexpMatchMedium_32    3.18MB/s ± 0%   3.18MB/s ± 0%    ~     (all equal)
RegexpMatchMedium_1K    19.7MB/s ± 0%   19.6MB/s ± 0%    ~     (p=0.190 n=10+9)
RegexpMatchHard_32      11.6MB/s ± 0%   11.6MB/s ± 1%    ~     (p=0.669 n=10+10)
RegexpMatchHard_1K      13.0MB/s ± 0%   13.0MB/s ± 0%    ~     (p=0.718 n=9+9)
Gzip                    44.4MB/s ± 0%   44.5MB/s ± 0%  +0.24%  (p=0.000 n=10+10)
JSONEncode              96.5MB/s ± 1%   96.1MB/s ± 1%    ~     (p=0.065 n=9+10)
JSONDecode              20.6MB/s ± 1%   20.7MB/s ± 1%  +0.42%  (p=0.041 n=10+10)
GobDecode               60.6MB/s ± 1%   60.0MB/s ± 2%  -0.92%  (p=0.016 n=10+10)
GobEncode               63.4MB/s ± 0%   63.6MB/s ± 0%    ~     (p=0.055 n=10+10)
Template                21.5MB/s ± 0%   21.4MB/s ± 0%  -0.30%  (p=0.000 n=9+10)
GoParse                 9.64MB/s ± 0%   9.61MB/s ± 0%  -0.36%  (p=0.000 n=10+10)
Revcomp                  380MB/s ± 0%    380MB/s ± 0%    ~     (p=0.323 n=10+10)
[Geo mean]              56.0MB/s        55.9MB/s       -0.07%

Change-Id: Ia732fa57fbcf4767d72382516d9f16705d177736
Reviewed-on: https://go-review.googlesource.com/96435
Run-TryBot: Cherry Zhang <cherryyz@google.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
`,
pos: []string{"\tCSEL\t"},
},
+ // Check that zero stores are combined into larger stores
+ {
+ fn: `
+ func $(b []byte) {
+ _ = b[1] // early bounds check to guarantee safety of writes below
+ b[0] = 0
+ b[1] = 0
+ }
+ `,
+ pos: []string{"MOVH\tZR"},
+ neg: []string{"MOVB"},
+ },
+ {
+ fn: `
+ func $(b []byte) {
+ _ = b[1] // early bounds check to guarantee safety of writes below
+ b[1] = 0
+ b[0] = 0
+ }
+ `,
+ pos: []string{"MOVH\tZR"},
+ neg: []string{"MOVB"},
+ },
+ {
+ fn: `
+ func $(b []byte) {
+ _ = b[3] // early bounds check to guarantee safety of writes below
+ b[0] = 0
+ b[1] = 0
+ b[2] = 0
+ b[3] = 0
+ }
+ `,
+ pos: []string{"MOVW\tZR"},
+ neg: []string{"MOVB", "MOVH"},
+ },
+ {
+ fn: `
+ func $(b []byte) {
+ _ = b[3] // early bounds check to guarantee safety of writes below
+ b[2] = 0
+ b[3] = 0
+ b[1] = 0
+ b[0] = 0
+ }
+ `,
+ pos: []string{"MOVW\tZR"},
+ neg: []string{"MOVB", "MOVH"},
+ },
+ {
+ fn: `
+ func $(h []uint16) {
+ _ = h[1] // early bounds check to guarantee safety of writes below
+ h[0] = 0
+ h[1] = 0
+ }
+ `,
+ pos: []string{"MOVW\tZR"},
+ neg: []string{"MOVB", "MOVH"},
+ },
+ {
+ fn: `
+ func $(h []uint16) {
+ _ = h[1] // early bounds check to guarantee safety of writes below
+ h[1] = 0
+ h[0] = 0
+ }
+ `,
+ pos: []string{"MOVW\tZR"},
+ neg: []string{"MOVB", "MOVH"},
+ },
+ {
+ fn: `
+ func $(b []byte) {
+ _ = b[7] // early bounds check to guarantee safety of writes below
+ b[0] = 0
+ b[1] = 0
+ b[2] = 0
+ b[3] = 0
+ b[4] = 0
+ b[5] = 0
+ b[6] = 0
+ b[7] = 0
+ }
+ `,
+ pos: []string{"MOVD\tZR"},
+ neg: []string{"MOVB", "MOVH", "MOVW"},
+ },
+ {
+ fn: `
+ func $(h []uint16) {
+ _ = h[3] // early bounds check to guarantee safety of writes below
+ h[0] = 0
+ h[1] = 0
+ h[2] = 0
+ h[3] = 0
+ }
+ `,
+ pos: []string{"MOVD\tZR"},
+ neg: []string{"MOVB", "MOVH", "MOVW"},
+ },
+ {
+ fn: `
+ func $(h []uint16) {
+ _ = h[3] // early bounds check to guarantee safety of writes below
+ h[2] = 0
+ h[3] = 0
+ h[1] = 0
+ h[0] = 0
+ }
+ `,
+ pos: []string{"MOVD\tZR"},
+ neg: []string{"MOVB", "MOVH", "MOVW"},
+ },
+ {
+ fn: `
+ func $(w []uint32) {
+ _ = w[1] // early bounds check to guarantee safety of writes below
+ w[0] = 0
+ w[1] = 0
+ }
+ `,
+ pos: []string{"MOVD\tZR"},
+ neg: []string{"MOVB", "MOVH", "MOVW"},
+ },
+ {
+ fn: `
+ func $(w []uint32) {
+ _ = w[1] // early bounds check to guarantee safety of writes below
+ w[1] = 0
+ w[0] = 0
+ }
+ `,
+ pos: []string{"MOVD\tZR"},
+ neg: []string{"MOVB", "MOVH", "MOVW"},
+ },
+ {
+ fn: `
+ func $(b []byte) {
+ _ = b[15] // early bounds check to guarantee safety of writes below
+ b[0] = 0
+ b[1] = 0
+ b[2] = 0
+ b[3] = 0
+ b[4] = 0
+ b[5] = 0
+ b[6] = 0
+ b[7] = 0
+ b[8] = 0
+ b[9] = 0
+ b[10] = 0
+ b[11] = 0
+ b[12] = 0
+ b[13] = 0
+ b[15] = 0
+ b[14] = 0
+ }
+ `,
+ pos: []string{"STP"},
+ neg: []string{"MOVB", "MOVH", "MOVW"},
+ },
+ {
+ fn: `
+ func $(h []uint16) {
+ _ = h[7] // early bounds check to guarantee safety of writes below
+ h[0] = 0
+ h[1] = 0
+ h[2] = 0
+ h[3] = 0
+ h[4] = 0
+ h[5] = 0
+ h[6] = 0
+ h[7] = 0
+ }
+ `,
+ pos: []string{"STP"},
+ neg: []string{"MOVB", "MOVH"},
+ },
+ {
+ fn: `
+ func $(w []uint32) {
+ _ = w[3] // early bounds check to guarantee safety of writes below
+ w[0] = 0
+ w[1] = 0
+ w[2] = 0
+ w[3] = 0
+ }
+ `,
+ pos: []string{"STP"},
+ neg: []string{"MOVB", "MOVH"},
+ },
+ {
+ fn: `
+ func $(w []uint32) {
+ _ = w[3] // early bounds check to guarantee safety of writes below
+ w[1] = 0
+ w[0] = 0
+ w[3] = 0
+ w[2] = 0
+ }
+ `,
+ pos: []string{"STP"},
+ neg: []string{"MOVB", "MOVH"},
+ },
+ {
+ fn: `
+ func $(d []uint64) {
+ _ = d[1] // early bounds check to guarantee safety of writes below
+ d[0] = 0
+ d[1] = 0
+ }
+ `,
+ pos: []string{"STP"},
+ neg: []string{"MOVB", "MOVH"},
+ },
+ {
+ fn: `
+ func $(d []uint64) {
+ _ = d[1] // early bounds check to guarantee safety of writes below
+ d[1] = 0
+ d[0] = 0
+ }
+ `,
+ pos: []string{"STP"},
+ neg: []string{"MOVB", "MOVH"},
+ },
}
var linuxMIPSTests = []*asmTest{
&& clobber(o4) && clobber(o5) && clobber(s0)
-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (REV <t> (MOVDload <t> {s} (OffPtr <p.Type> [i0] p) mem))
+// Combine zero stores into larger (unaligned) stores.
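+// For example, a concrete instance of the first rule below:
+//   (MOVBstorezero [1] {s} p (MOVBstorezero [0] {s} p mem))
+//     -> (MOVHstorezero [0] {s} p mem)
+// The rules apply repeatedly, so e.g. four adjacent byte zero stores
+// first become two halfword zero stores and then one word zero store.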
+(MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(i,j,1)
+ && is32Bit(min(i,j))
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ -> (MOVHstorezero [min(i,j)] {s} ptr0 mem)
+(MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(i,j,2)
+ && is32Bit(min(i,j))
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ -> (MOVWstorezero [min(i,j)] {s} ptr0 mem)
+(MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(i,j,4)
+ && is32Bit(min(i,j))
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ -> (MOVDstorezero [min(i,j)] {s} ptr0 mem)
+(MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
+ && x.Uses == 1
+ && areAdjacentOffsets(i,j,8)
+ && is32Bit(min(i,j))
+ && isSamePtr(ptr0, ptr1)
+ && clobber(x)
+ -> (MOVQstorezero [min(i,j)] {s} ptr0 mem)
+
// FP simplification
(FNEGS (FMULS x y)) -> (FNMULS x y)
(FNEGD (FMULD x y)) -> (FNMULD x y)
return false
}
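+// areAdjacentOffsets reports whether off1 and off2 are exactly size bytes
+// apart, in either order (i.e. whether the two size-byte regions they
+// address are adjacent). For example, areAdjacentOffsets(0, 1, 1) and
+// areAdjacentOffsets(1, 0, 1) are true, while areAdjacentOffsets(0, 2, 1)
+// is false.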
+func areAdjacentOffsets(off1, off2, size int64) bool {
+ return off1+size == off2 || off1 == off2+size
+}
+
// check if value zeroes out upper 32-bit of 64-bit register.
// depth limits recursion depth. In AMD64.rules 3 is used as limit,
// because it catches same amount of cases as 4.
v.AddArg(mem)
return true
}
+ // match: (MOVBstorezero [i] {s} ptr0 x:(MOVBstorezero [j] {s} ptr1 mem))
+ // cond: x.Uses == 1 && areAdjacentOffsets(i,j,1) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
+ // result: (MOVHstorezero [min(i,j)] {s} ptr0 mem)
+ for {
+ i := v.AuxInt
+ s := v.Aux
+ _ = v.Args[1]
+ ptr0 := v.Args[0]
+ x := v.Args[1]
+ if x.Op != OpARM64MOVBstorezero {
+ break
+ }
+ j := x.AuxInt
+ if x.Aux != s {
+ break
+ }
+ _ = x.Args[1]
+ ptr1 := x.Args[0]
+ mem := x.Args[1]
+ if !(x.Uses == 1 && areAdjacentOffsets(i, j, 1) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
+ break
+ }
+ v.reset(OpARM64MOVHstorezero)
+ v.AuxInt = min(i, j)
+ v.Aux = s
+ v.AddArg(ptr0)
+ v.AddArg(mem)
+ return true
+ }
return false
}
func rewriteValueARM64_OpARM64MOVDload_0(v *Value) bool {
v.AddArg(mem)
return true
}
+ // match: (MOVDstorezero [i] {s} ptr0 x:(MOVDstorezero [j] {s} ptr1 mem))
+ // cond: x.Uses == 1 && areAdjacentOffsets(i,j,8) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
+ // result: (MOVQstorezero [min(i,j)] {s} ptr0 mem)
+ for {
+ i := v.AuxInt
+ s := v.Aux
+ _ = v.Args[1]
+ ptr0 := v.Args[0]
+ x := v.Args[1]
+ if x.Op != OpARM64MOVDstorezero {
+ break
+ }
+ j := x.AuxInt
+ if x.Aux != s {
+ break
+ }
+ _ = x.Args[1]
+ ptr1 := x.Args[0]
+ mem := x.Args[1]
+ if !(x.Uses == 1 && areAdjacentOffsets(i, j, 8) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
+ break
+ }
+ v.reset(OpARM64MOVQstorezero)
+ v.AuxInt = min(i, j)
+ v.Aux = s
+ v.AddArg(ptr0)
+ v.AddArg(mem)
+ return true
+ }
return false
}
func rewriteValueARM64_OpARM64MOVHUload_0(v *Value) bool {
v.AddArg(mem)
return true
}
+ // match: (MOVHstorezero [i] {s} ptr0 x:(MOVHstorezero [j] {s} ptr1 mem))
+ // cond: x.Uses == 1 && areAdjacentOffsets(i,j,2) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
+ // result: (MOVWstorezero [min(i,j)] {s} ptr0 mem)
+ for {
+ i := v.AuxInt
+ s := v.Aux
+ _ = v.Args[1]
+ ptr0 := v.Args[0]
+ x := v.Args[1]
+ if x.Op != OpARM64MOVHstorezero {
+ break
+ }
+ j := x.AuxInt
+ if x.Aux != s {
+ break
+ }
+ _ = x.Args[1]
+ ptr1 := x.Args[0]
+ mem := x.Args[1]
+ if !(x.Uses == 1 && areAdjacentOffsets(i, j, 2) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
+ break
+ }
+ v.reset(OpARM64MOVWstorezero)
+ v.AuxInt = min(i, j)
+ v.Aux = s
+ v.AddArg(ptr0)
+ v.AddArg(mem)
+ return true
+ }
return false
}
func rewriteValueARM64_OpARM64MOVQstorezero_0(v *Value) bool {
v.AddArg(mem)
return true
}
+ // match: (MOVWstorezero [i] {s} ptr0 x:(MOVWstorezero [j] {s} ptr1 mem))
+ // cond: x.Uses == 1 && areAdjacentOffsets(i,j,4) && is32Bit(min(i,j)) && isSamePtr(ptr0, ptr1) && clobber(x)
+ // result: (MOVDstorezero [min(i,j)] {s} ptr0 mem)
+ for {
+ i := v.AuxInt
+ s := v.Aux
+ _ = v.Args[1]
+ ptr0 := v.Args[0]
+ x := v.Args[1]
+ if x.Op != OpARM64MOVWstorezero {
+ break
+ }
+ j := x.AuxInt
+ if x.Aux != s {
+ break
+ }
+ _ = x.Args[1]
+ ptr1 := x.Args[0]
+ mem := x.Args[1]
+ if !(x.Uses == 1 && areAdjacentOffsets(i, j, 4) && is32Bit(min(i, j)) && isSamePtr(ptr0, ptr1) && clobber(x)) {
+ break
+ }
+ v.reset(OpARM64MOVDstorezero)
+ v.AuxInt = min(i, j)
+ v.Aux = s
+ v.AddArg(ptr0)
+ v.AddArg(mem)
+ return true
+ }
return false
}
func rewriteValueARM64_OpARM64MUL_0(v *Value) bool {