A follow-on to CL 36963, adding fused multiply-add/sub support for ppc64x.
Performance changes (as posted on the issue):
poly1305:
benchmark                  old ns/op    new ns/op    delta
Benchmark64-16             172          151          -12.21%
Benchmark1K-16             1828         1523         -16.68%
Benchmark64Unaligned-16    172          151          -12.21%
Benchmark1KUnaligned-16    1827         1523         -16.64%
math:
benchmark                  old ns/op    new ns/op    delta
BenchmarkAcos-16           43.9         39.9         -9.11%
BenchmarkAcosh-16          57.0         45.8         -19.65%
BenchmarkAsin-16           35.8         33.0         -7.82%
BenchmarkAsinh-16          68.6         60.8         -11.37%
BenchmarkAtan-16           19.8         16.2         -18.18%
BenchmarkAtanh-16          65.5         57.5         -12.21%
BenchmarkAtan2-16          45.4         34.2         -24.67%
BenchmarkGamma-16          37.6         26.0         -30.85%
BenchmarkLgamma-16         40.0         28.2         -29.50%
BenchmarkLog1p-16          35.1         29.1         -17.09%
BenchmarkSin-16            22.7         18.4         -18.94%
BenchmarkSincos-16         31.7         23.7         -25.24%
BenchmarkSinh-16           146          131          -10.27%
BenchmarkY0-16             130          107          -17.69%
BenchmarkY1-16             127          107          -15.75%
BenchmarkYn-16             278          235          -15.47%
Updates #17895.
Change-Id: I1c16199715d20c9c4bd97c4a950bcfa69eb688c1
Reviewed-on: https://go-review.googlesource.com/38095
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
imports: []string{"math/bits"},
tests: linuxMIPSTests,
},
+ {
+ arch: "ppc64le",
+ os: "linux",
+ tests: linuxPPC64LETests,
+ },
}
var linuxAMD64Tests = []*asmTest{
},
}
+var linuxPPC64LETests = []*asmTest{
+ // Fused multiply-add/sub instructions.
+ {
+ `
+ func f0(x, y, z float64) float64 {
+ return x * y + z
+ }
+ `,
+ []string{"\tFMADD\t"},
+ },
+ {
+ `
+ func f1(x, y, z float64) float64 {
+ return x * y - z
+ }
+ `,
+ []string{"\tFMSUB\t"},
+ },
+ {
+ `
+ func f2(x, y, z float32) float32 {
+ return x * y + z
+ }
+ `,
+ []string{"\tFMADDS\t"},
+ },
+ {
+ `
+ func f3(x, y, z float32) float32 {
+ return x * y - z
+ }
+ `,
+ []string{"\tFMSUBS\t"},
+ },
+}
+
// TestLineNumber checks to make sure the generated assembly has line numbers
// see issue #16214
func TestLineNumber(t *testing.T) {
ppc64.AFMULS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AFDIV & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AFDIVS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
+ ppc64.AFMADD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
+ ppc64.AFMADDS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
+ ppc64.AFMSUB & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
+ ppc64.AFMSUBS & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AFCTIDZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AFCTIWZ & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
ppc64.AFCFID & obj.AMask: {Flags: gc.SizeF | gc.LeftRead | gc.RegRead | gc.RightWrite},
// Closure pointer is R11 (already)
gc.CheckLoweredGetClosurePtr(v)
+ case ssa.OpPPC64LoweredRound32F, ssa.OpPPC64LoweredRound64F:
+ // input is already rounded
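+ // Nothing to emit: the value is already correctly rounded
+ // (float32 results come from FRSP). These ops exist only so the
+ // rewrite rules cannot fuse a multiply-add across an explicit
+ // rounding step.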
+
case ssa.OpLoadReg:
loadOp := loadByType(v.Type)
p := gc.Prog(loadOp)
p.To.Type = obj.TYPE_REG
p.To.Reg = r
+ case ssa.OpPPC64FMADD, ssa.OpPPC64FMADDS, ssa.OpPPC64FMSUB, ssa.OpPPC64FMSUBS:
+ r := v.Reg()
+ r1 := v.Args[0].Reg()
+ r2 := v.Args[1].Reg()
+ r3 := v.Args[2].Reg()
+ // r = r1*r2 ± r3
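+ // FMADD fra,frb,frc,frt computes frt = fra*frc ± frb,
+ // so arg0 goes in From (fra), arg2 in Reg (frb) and arg1 in From3 (frc).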
+ p := gc.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = r1
+ p.Reg = r3
+ p.From3 = new(obj.Addr)
+ p.From3.Type = obj.TYPE_REG
+ p.From3.Reg = r2
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = r
+
case ssa.OpPPC64MaskIfNotCarry:
r := v.Reg()
p := gc.Prog(v.Op.Asm())
(Cvt32Fto64F x) -> x // Note x will have the wrong type for patterns dependent on Float32/Float64
(Cvt64Fto32F x) -> (FRSP x)
-(Round32F x) -> x
-(Round64F x) -> x
+(Round32F x) -> (LoweredRound32F x)
+(Round64F x) -> (LoweredRound64F x)
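+// Rounding must survive as a real op so the fused multiply-add rules
+// below cannot combine a multiply and an add across an explicit
+// rounding step.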
(Sqrt x) -> (FSQRT x)
// A particular pattern seen in cgo code:
(AND (MOVDconst [c]) x:(MOVBZload _ _)) -> (ANDconst [c&0xFF] x)
(AND x:(MOVBZload _ _) (MOVDconst [c])) -> (ANDconst [c&0xFF] x)
+
+// floating-point fused multiply-add/sub
+(FADD z (FMUL x y)) -> (FMADD x y z)
+(FADD (FMUL x y) z) -> (FMADD x y z)
+(FSUB (FMUL x y) z) -> (FMSUB x y z)
+(FADDS z (FMULS x y)) -> (FMADDS x y z)
+(FADDS (FMULS x y) z) -> (FMADDS x y z)
+(FSUBS (FMULS x y) z) -> (FMSUBS x y z)
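+// Note: z - x*y is not matched; FMSUB computes x*y - z, and the
+// reversed form would need a negated variant such as FNMSUB.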
fpgp = regInfo{inputs: []regMask{fp}, outputs: []regMask{gp}}
gpfp = regInfo{inputs: []regMask{gp}, outputs: []regMask{fp}}
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
+ fp31 = regInfo{inputs: []regMask{fp, fp, fp}, outputs: []regMask{fp}}
fp2cr = regInfo{inputs: []regMask{fp, fp}}
fpload = regInfo{inputs: []regMask{gp | sp | sb}, outputs: []regMask{fp}}
fpstore = regInfo{inputs: []regMask{gp | sp | sb, fp}}
{name: "FMUL", argLength: 2, reg: fp21, asm: "FMUL", commutative: true}, // arg0*arg1
{name: "FMULS", argLength: 2, reg: fp21, asm: "FMULS", commutative: true}, // arg0*arg1
+ {name: "FMADD", argLength: 3, reg: fp31, asm: "FMADD"}, // arg0*arg1 + arg2
+ {name: "FMADDS", argLength: 3, reg: fp31, asm: "FMADDS"}, // arg0*arg1 + arg2
+ {name: "FMSUB", argLength: 3, reg: fp31, asm: "FMSUB"}, // arg0*arg1 - arg2
+ {name: "FMSUBS", argLength: 3, reg: fp31, asm: "FMSUBS"}, // arg0*arg1 - arg2
+
{name: "SRAD", argLength: 2, reg: gp21, asm: "SRAD"}, // arg0 >>a arg1, 64 bits (all sign if arg1 & 64 != 0)
{name: "SRAW", argLength: 2, reg: gp21, asm: "SRAW"}, // arg0 >>a arg1, 32 bits (all sign if arg1 & 32 != 0)
{name: "SRD", argLength: 2, reg: gp21, asm: "SRD"}, // arg0 >> arg1, 64 bits (0 if arg1 & 64 != 0)
//arg0=ptr,arg1=mem, returns void. Faults if ptr is nil.
{name: "LoweredNilCheck", argLength: 2, reg: regInfo{inputs: []regMask{gp | sp | sb}, clobbers: tmp}, clobberFlags: true, nilCheck: true, faultOnNilArg0: true},
+ // Round ops to block fused-multiply-add extraction.
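+ // They emit no machine code; resultInArg0 keeps the value in place.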
+ {name: "LoweredRound32F", argLength: 1, reg: fp11, resultInArg0: true},
+ {name: "LoweredRound64F", argLength: 1, reg: fp11, resultInArg0: true},
// Convert pointer to integer, takes a memory operand for ordering.
{name: "MOVDconvert", argLength: 2, reg: gp11, asm: "MOVD"},
OpPPC64MULHWU
OpPPC64FMUL
OpPPC64FMULS
+ OpPPC64FMADD
+ OpPPC64FMADDS
+ OpPPC64FMSUB
+ OpPPC64FMSUBS
OpPPC64SRAD
OpPPC64SRAW
OpPPC64SRD
OpPPC64FGreaterEqual
OpPPC64LoweredGetClosurePtr
OpPPC64LoweredNilCheck
+ OpPPC64LoweredRound32F
+ OpPPC64LoweredRound64F
OpPPC64MOVDconvert
OpPPC64CALLstatic
OpPPC64CALLclosure
},
},
},
+ {
+ name: "FMADD",
+ argLen: 3,
+ asm: ppc64.AFMADD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ {1, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ {2, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ outputs: []outputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ },
+ },
+ {
+ name: "FMADDS",
+ argLen: 3,
+ asm: ppc64.AFMADDS,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ {1, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ {2, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ outputs: []outputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ },
+ },
+ {
+ name: "FMSUB",
+ argLen: 3,
+ asm: ppc64.AFMSUB,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ {1, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ {2, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ outputs: []outputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ },
+ },
+ {
+ name: "FMSUBS",
+ argLen: 3,
+ asm: ppc64.AFMSUBS,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ {1, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ {2, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ outputs: []outputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ },
+ },
{
name: "SRAD",
argLen: 2,
clobbers: 2147483648, // R31
},
},
+ {
+ name: "LoweredRound32F",
+ argLen: 1,
+ resultInArg0: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ outputs: []outputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ },
+ },
+ {
+ name: "LoweredRound64F",
+ argLen: 1,
+ resultInArg0: true,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ outputs: []outputInfo{
+ {0, 576460743713488896}, // F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26
+ },
+ },
+ },
{
name: "MOVDconvert",
argLen: 2,
return rewriteValuePPC64_OpPPC64CMPconst(v)
case OpPPC64Equal:
return rewriteValuePPC64_OpPPC64Equal(v)
+ case OpPPC64FADD:
+ return rewriteValuePPC64_OpPPC64FADD(v)
+ case OpPPC64FADDS:
+ return rewriteValuePPC64_OpPPC64FADDS(v)
case OpPPC64FMOVDload:
return rewriteValuePPC64_OpPPC64FMOVDload(v)
case OpPPC64FMOVDstore:
return rewriteValuePPC64_OpPPC64FMOVDstore(v)
case OpPPC64FMOVSload:
return rewriteValuePPC64_OpPPC64FMOVSload(v)
case OpPPC64FMOVSstore:
return rewriteValuePPC64_OpPPC64FMOVSstore(v)
+ case OpPPC64FSUB:
+ return rewriteValuePPC64_OpPPC64FSUB(v)
+ case OpPPC64FSUBS:
+ return rewriteValuePPC64_OpPPC64FSUBS(v)
case OpPPC64GreaterEqual:
return rewriteValuePPC64_OpPPC64GreaterEqual(v)
case OpPPC64GreaterThan:
}
return false
}
+func rewriteValuePPC64_OpPPC64FADD(v *Value) bool {
+ // match: (FADD z (FMUL x y))
+ // cond:
+ // result: (FMADD x y z)
+ for {
+ z := v.Args[0]
+ v_1 := v.Args[1]
+ if v_1.Op != OpPPC64FMUL {
+ break
+ }
+ x := v_1.Args[0]
+ y := v_1.Args[1]
+ v.reset(OpPPC64FMADD)
+ v.AddArg(x)
+ v.AddArg(y)
+ v.AddArg(z)
+ return true
+ }
+ // match: (FADD (FMUL x y) z)
+ // cond:
+ // result: (FMADD x y z)
+ for {
+ v_0 := v.Args[0]
+ if v_0.Op != OpPPC64FMUL {
+ break
+ }
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ z := v.Args[1]
+ v.reset(OpPPC64FMADD)
+ v.AddArg(x)
+ v.AddArg(y)
+ v.AddArg(z)
+ return true
+ }
+ return false
+}
+func rewriteValuePPC64_OpPPC64FADDS(v *Value) bool {
+ // match: (FADDS z (FMULS x y))
+ // cond:
+ // result: (FMADDS x y z)
+ for {
+ z := v.Args[0]
+ v_1 := v.Args[1]
+ if v_1.Op != OpPPC64FMULS {
+ break
+ }
+ x := v_1.Args[0]
+ y := v_1.Args[1]
+ v.reset(OpPPC64FMADDS)
+ v.AddArg(x)
+ v.AddArg(y)
+ v.AddArg(z)
+ return true
+ }
+ // match: (FADDS (FMULS x y) z)
+ // cond:
+ // result: (FMADDS x y z)
+ for {
+ v_0 := v.Args[0]
+ if v_0.Op != OpPPC64FMULS {
+ break
+ }
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ z := v.Args[1]
+ v.reset(OpPPC64FMADDS)
+ v.AddArg(x)
+ v.AddArg(y)
+ v.AddArg(z)
+ return true
+ }
+ return false
+}
func rewriteValuePPC64_OpPPC64FMOVDload(v *Value) bool {
// match: (FMOVDload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
// cond: canMergeSym(sym1,sym2)
}
return false
}
+func rewriteValuePPC64_OpPPC64FSUB(v *Value) bool {
+ // match: (FSUB (FMUL x y) z)
+ // cond:
+ // result: (FMSUB x y z)
+ for {
+ v_0 := v.Args[0]
+ if v_0.Op != OpPPC64FMUL {
+ break
+ }
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ z := v.Args[1]
+ v.reset(OpPPC64FMSUB)
+ v.AddArg(x)
+ v.AddArg(y)
+ v.AddArg(z)
+ return true
+ }
+ return false
+}
+func rewriteValuePPC64_OpPPC64FSUBS(v *Value) bool {
+ // match: (FSUBS (FMULS x y) z)
+ // cond:
+ // result: (FMSUBS x y z)
+ for {
+ v_0 := v.Args[0]
+ if v_0.Op != OpPPC64FMULS {
+ break
+ }
+ x := v_0.Args[0]
+ y := v_0.Args[1]
+ z := v.Args[1]
+ v.reset(OpPPC64FMSUBS)
+ v.AddArg(x)
+ v.AddArg(y)
+ v.AddArg(z)
+ return true
+ }
+ return false
+}
func rewriteValuePPC64_OpPPC64GreaterEqual(v *Value) bool {
// match: (GreaterEqual (FlagEQ))
// cond:
func rewriteValuePPC64_OpRound32F(v *Value) bool {
// match: (Round32F x)
// cond:
- // result: x
+ // result: (LoweredRound32F x)
for {
x := v.Args[0]
- v.reset(OpCopy)
- v.Type = x.Type
+ v.reset(OpPPC64LoweredRound32F)
v.AddArg(x)
return true
}
func rewriteValuePPC64_OpRound64F(v *Value) bool {
// match: (Round64F x)
// cond:
- // result: x
+ // result: (LoweredRound64F x)
for {
x := v.Args[0]
- v.reset(OpCopy)
- v.Type = x.Type
+ v.reset(OpPPC64LoweredRound64F)
v.AddArg(x)
return true
}
}
o1 = AOP_RRR(oprrr(ctxt, p.As), uint32(p.To.Reg), 0, uint32(r))
- case 34: /* FMADDx fra,frb,frc,frd (d=a*b+c); FSELx a<0? (d=b): (d=c) */
+ case 34: /* FMADDx fra,frb,frc,frt (t=a*c±b) */
o1 = AOP_RRR(oprrr(ctxt, p.As), uint32(p.To.Reg), uint32(p.From.Reg), uint32(p.Reg)) | (uint32(p.From3.Reg)&31)<<6
case 35: /* mov r,lext/lauto/loreg ==> cau $(v>>16),sb,r'; store o(r') */