math.RoundToEven can be done with a single arm64 instruction, FRINTND,
so intrinsify it to improve performance.

The current pure Go implementation of Abs is translated into five
instructions on arm64: str, ldr, and, str, ldr. The intrinsic
implementation requires only one instruction, so intrinsifying it is
worthwhile for performance as well.
Benchmarks:
name           old time/op  new time/op  delta
Abs-8          3.50ns ± 0%  1.50ns ± 0%  -57.14%  (p=0.000 n=10+10)
RoundToEven-8  9.26ns ± 0%  1.50ns ± 0%  -83.80%  (p=0.000 n=10+10)
Change-Id: I9456b26ab282b544dfac0154fc86f17aed96ac3d
Reviewed-on: https://go-review.googlesource.com/116535
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
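
For context, the pure Go Abs that this CL replaces with FABSD simply clears
the IEEE 754 sign bit. Below is a minimal standalone sketch; the absBits
helper and main are illustrative and not part of the CL (the real code lives
in src/math/abs.go):

    package main

    import (
        "fmt"
        "math"
    )

    // absBits mirrors the pure Go math.Abs: clear the IEEE 754 sign bit.
    // Before this CL the float->integer->float round trip was lowered on
    // arm64 through memory (the str/ldr pairs around the and mentioned in
    // the commit message); the FABSD intrinsic does the same job in one
    // instruction.
    func absBits(x float64) float64 {
        return math.Float64frombits(math.Float64bits(x) &^ (1 << 63))
    }

    func main() {
        fmt.Println(absBits(-1.5), absBits(1.5)) // 1.5 1.5
    }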
fallthrough
case ssa.OpARM64MVN,
ssa.OpARM64NEG,
+ ssa.OpARM64FABSD,
ssa.OpARM64FMOVDfpgp,
ssa.OpARM64FMOVDgpfp,
ssa.OpARM64FNEGS,
ssa.OpARM64CLZW,
ssa.OpARM64FRINTAD,
ssa.OpARM64FRINTMD,
+ ssa.OpARM64FRINTND,
ssa.OpARM64FRINTPD,
ssa.OpARM64FRINTZD:
p := s.Prog(v.Op.Asm())
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpRoundToEven, types.Types[TFLOAT64], args[0])
},
- sys.S390X)
+ sys.ARM64, sys.S390X)
addF("math", "Abs",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpAbs, types.Types[TFLOAT64], args[0])
},
- sys.PPC64)
+ sys.ARM64, sys.PPC64)
addF("math", "Copysign",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpCopysign, types.Types[TFLOAT64], args[0], args[1])
(Com8 x) -> (MVN x)
// math package intrinsics
+(Abs x) -> (FABSD x)
(Sqrt x) -> (FSQRTD x)
(Ceil x) -> (FRINTPD x)
(Floor x) -> (FRINTMD x)
(Round x) -> (FRINTAD x)
+(RoundToEven x) -> (FRINTND x)
(Trunc x) -> (FRINTZD x)
// lowering rotates
// unary ops
{name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
+ {name: "FABSD", argLength: 1, reg: fp11, asm: "FABSD"}, // abs(arg0), float64
{name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"}, // -arg0, float32
{name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64
{name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64
// floating-point round to integral
{name: "FRINTAD", argLength: 1, reg: fp11, asm: "FRINTAD"},
{name: "FRINTMD", argLength: 1, reg: fp11, asm: "FRINTMD"},
+ {name: "FRINTND", argLength: 1, reg: fp11, asm: "FRINTND"},
{name: "FRINTPD", argLength: 1, reg: fp11, asm: "FRINTPD"},
{name: "FRINTZD", argLength: 1, reg: fp11, asm: "FRINTZD"},
OpARM64LoweredMuluhilo
OpARM64MVN
OpARM64NEG
+ OpARM64FABSD
OpARM64FNEGS
OpARM64FNEGD
OpARM64FSQRTD
OpARM64FCVTDS
OpARM64FRINTAD
OpARM64FRINTMD
+ OpARM64FRINTND
OpARM64FRINTPD
OpARM64FRINTZD
OpARM64CSEL
},
},
},
+ {
+ name: "FABSD",
+ argLen: 1,
+ asm: arm64.AFABSD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
{
name: "FNEGS",
argLen: 1,
},
},
},
+ {
+ name: "FRINTND",
+ argLen: 1,
+ asm: arm64.AFRINTND,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ outputs: []outputInfo{
+ {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31
+ },
+ },
+ },
{
name: "FRINTPD",
argLen: 1,
return rewriteValueARM64_OpARM64XORshiftRA_0(v)
case OpARM64XORshiftRL:
return rewriteValueARM64_OpARM64XORshiftRL_0(v)
+ case OpAbs:
+ return rewriteValueARM64_OpAbs_0(v)
case OpAdd16:
return rewriteValueARM64_OpAdd16_0(v)
case OpAdd32:
return rewriteValueARM64_OpRound32F_0(v)
case OpRound64F:
return rewriteValueARM64_OpRound64F_0(v)
+ case OpRoundToEven:
+ return rewriteValueARM64_OpRoundToEven_0(v)
case OpRsh16Ux16:
return rewriteValueARM64_OpRsh16Ux16_0(v)
case OpRsh16Ux32:
}
return false
}
+func rewriteValueARM64_OpAbs_0(v *Value) bool {
+ // match: (Abs x)
+ // cond:
+ // result: (FABSD x)
+ for {
+ x := v.Args[0]
+ v.reset(OpARM64FABSD)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueARM64_OpAdd16_0(v *Value) bool {
// match: (Add16 x y)
// cond:
return true
}
}
+func rewriteValueARM64_OpRoundToEven_0(v *Value) bool {
+ // match: (RoundToEven x)
+ // cond:
+ // result: (FRINTND x)
+ for {
+ x := v.Args[0]
+ v.reset(OpARM64FRINTND)
+ v.AddArg(x)
+ return true
+ }
+}
func rewriteValueARM64_OpRsh16Ux16_0(v *Value) bool {
b := v.Block
_ = b
sink64[3] = math.Trunc(x)
// s390x:"FIDBR\t[$]4"
+ // arm64:"FRINTND"
sink64[4] = math.RoundToEven(x)
}
// Check that it's using integer registers
func abs(x, y float64) {
// amd64:"BTRQ\t[$]63"
+ // arm64:"FABSD\t"
// s390x:"LPDFR\t",-"MOVD\t" (no integer load/store)
// ppc64le:"FABS\t"
sink64[0] = math.Abs(x)
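
The benchmark numbers in the commit message presumably come from the existing
BenchmarkAbs and BenchmarkRoundToEven in the math package. An equivalent
standalone sketch (the bench package name and sink variable are illustrative)
that can be measured before and after this change:

    package bench

    import (
        "math"
        "testing"
    )

    // Store results in a package-level sink so the compiler cannot
    // eliminate the calls being measured.
    var sink float64

    func BenchmarkAbs(b *testing.B) {
        x := 0.0
        for i := 0; i < b.N; i++ {
            x = math.Abs(-1.5)
        }
        sink = x
    }

    func BenchmarkRoundToEven(b *testing.B) {
        x := 0.0
        for i := 0; i < b.N; i++ {
            x = math.RoundToEven(2.5)
        }
        sink = x
    }

Run with go test -bench=. -count=10 on each toolchain and compare the outputs
with benchstat to get a table like the one above.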