From: fanzha02 Date: Thu, 12 Jul 2018 03:23:21 +0000 (+0000) Subject: cmd/internal/obj/arm64: encode float constants into FMOVS/FMOVD instructions X-Git-Tag: go1.12beta1~1156 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=c430adf1362dbe4c150cba98214fd521d0e9933a;p=gostls13.git cmd/internal/obj/arm64: encode float constants into FMOVS/FMOVD instructions Current assembler rewrites float constants to values stored in memory except 0.0, which is not performant. This patch uses the FMOVS/FMOVD instructions to move some available floating-point immediate constants into SIMD&FP destination registers. These available constants can be encoded into FMOVS/FMOVD instructions, checked by the chipfloat7() function. go1 benchmark results. name old time/op new time/op delta BinaryTree17-8 6.27s ± 1% 6.27s ± 1% ~ (p=0.762 n=10+8) Fannkuch11-8 5.42s ± 1% 5.38s ± 0% -0.63% (p=0.000 n=10+10) FmtFprintfEmpty-8 92.9ns ± 1% 93.4ns ± 0% +0.47% (p=0.004 n=9+8) FmtFprintfString-8 169ns ± 2% 170ns ± 4% ~ (p=0.378 n=10+10) FmtFprintfInt-8 197ns ± 1% 196ns ± 1% -0.77% (p=0.009 n=10+9) FmtFprintfIntInt-8 284ns ± 1% 286ns ± 1% ~ (p=0.051 n=10+10) FmtFprintfPrefixedInt-8 419ns ± 0% 422ns ± 1% +0.69% (p=0.038 n=6+10) FmtFprintfFloat-8 458ns ± 0% 463ns ± 1% +1.14% (p=0.000 n=10+10) FmtManyArgs-8 1.35µs ± 2% 1.36µs ± 1% +0.91% (p=0.043 n=10+10) GobDecode-8 16.0ms ± 2% 15.5ms ± 1% -3.39% (p=0.000 n=10+10) GobEncode-8 11.9ms ± 3% 11.4ms ± 1% -3.98% (p=0.000 n=10+9) Gzip-8 621ms ± 0% 625ms ± 0% +0.59% (p=0.000 n=9+10) Gunzip-8 74.0ms ± 1% 74.3ms ± 0% ~ (p=0.059 n=9+8) HTTPClientServer-8 116µs ± 1% 116µs ± 1% ~ (p=0.165 n=10+10) JSONEncode-8 29.3ms ± 1% 29.5ms ± 0% +0.72% (p=0.001 n=10+10) JSONDecode-8 145ms ± 1% 148ms ± 2% +2.06% (p=0.000 n=10+10) Mandelbrot200-8 9.67ms ± 0% 9.48ms ± 1% -1.92% (p=0.000 n=8+10) GoParse-8 7.55ms ± 0% 7.60ms ± 0% +0.57% (p=0.000 n=9+10) RegexpMatchEasy0_32-8 234ns ± 0% 210ns ± 0% -10.13% (p=0.000 n=8+10) RegexpMatchEasy0_1K-8 753ns ± 1% 729ns ± 0% -3.17% 
(p=0.000 n=10+8) RegexpMatchEasy1_32-8 225ns ± 0% 224ns ± 0% -0.44% (p=0.000 n=9+9) RegexpMatchEasy1_1K-8 1.03µs ± 0% 1.04µs ± 1% +1.29% (p=0.000 n=10+10) RegexpMatchMedium_32-8 320ns ± 3% 296ns ± 6% -7.50% (p=0.000 n=10+10) RegexpMatchMedium_1K-8 77.0µs ± 5% 73.6µs ± 1% ~ (p=0.393 n=10+10) RegexpMatchHard_32-8 3.93µs ± 0% 3.89µs ± 1% -0.95% (p=0.000 n=10+9) RegexpMatchHard_1K-8 120µs ± 5% 115µs ± 1% ~ (p=0.739 n=10+10) Revcomp-8 1.07s ± 0% 1.08s ± 1% +0.63% (p=0.000 n=10+9) Template-8 165ms ± 1% 163ms ± 1% -1.05% (p=0.001 n=8+10) TimeParse-8 751ns ± 1% 749ns ± 1% ~ (p=0.209 n=10+10) TimeFormat-8 759ns ± 1% 751ns ± 1% -0.96% (p=0.001 n=10+10) name old speed new speed delta GobDecode-8 48.0MB/s ± 2% 49.6MB/s ± 1% +3.50% (p=0.000 n=10+10) GobEncode-8 64.5MB/s ± 3% 67.1MB/s ± 1% +4.08% (p=0.000 n=10+9) Gzip-8 31.2MB/s ± 0% 31.1MB/s ± 0% -0.55% (p=0.000 n=9+8) Gunzip-8 262MB/s ± 1% 261MB/s ± 0% ~ (p=0.059 n=9+8) JSONEncode-8 66.3MB/s ± 1% 65.8MB/s ± 0% -0.72% (p=0.001 n=10+10) JSONDecode-8 13.4MB/s ± 1% 13.2MB/s ± 1% -2.02% (p=0.000 n=10+10) GoParse-8 7.67MB/s ± 0% 7.63MB/s ± 0% -0.57% (p=0.000 n=9+10) RegexpMatchEasy0_32-8 136MB/s ± 0% 152MB/s ± 0% +11.45% (p=0.000 n=10+10) RegexpMatchEasy0_1K-8 1.36GB/s ± 1% 1.40GB/s ± 0% +3.25% (p=0.000 n=10+8) RegexpMatchEasy1_32-8 142MB/s ± 0% 143MB/s ± 0% +0.35% (p=0.000 n=10+9) RegexpMatchEasy1_1K-8 992MB/s ± 0% 980MB/s ± 1% -1.27% (p=0.000 n=10+10) RegexpMatchMedium_32-8 3.12MB/s ± 3% 3.38MB/s ± 6% +8.17% (p=0.000 n=10+10) RegexpMatchMedium_1K-8 13.3MB/s ± 5% 13.9MB/s ± 1% ~ (p=0.362 n=10+10) RegexpMatchHard_32-8 8.14MB/s ± 0% 8.21MB/s ± 1% +0.95% (p=0.000 n=10+9) RegexpMatchHard_1K-8 8.54MB/s ± 5% 8.90MB/s ± 1% ~ (p=0.636 n=10+10) Revcomp-8 238MB/s ± 0% 236MB/s ± 1% -0.63% (p=0.000 n=10+9) Template-8 11.8MB/s ± 1% 11.9MB/s ± 1% +1.07% (p=0.001 n=8+10) Change-Id: I57b372d8dcd47e6aec39893843b20385d5d9c37e Reviewed-on: https://go-review.googlesource.com/129555 Run-TryBot: Cherry Zhang TryBot-Result: Gobot Gobot Reviewed-by: 
Cherry Zhang --- diff --git a/src/cmd/asm/internal/asm/testdata/arm64.s b/src/cmd/asm/internal/asm/testdata/arm64.s index feb507db86..361b7a45c0 100644 --- a/src/cmd/asm/internal/asm/testdata/arm64.s +++ b/src/cmd/asm/internal/asm/testdata/arm64.s @@ -163,6 +163,12 @@ TEXT foo(SB), DUPOK|NOSPLIT, $-8 MOVB (R29)(R30<<0), R14 // ae7bbe38 MOVB (R29)(R30), R14 // MOVB (R29)(R30*1), R14 // ae6bbe38 MOVB R4, (R2)(R6.SXTX) // 44e82638 + FMOVS $(4.0), F0 // 0010221e + FMOVD $(4.0), F0 // 0010621e + FMOVS $(0.265625), F1 // 01302a1e + FMOVD $(0.1796875), F2 // 02f0681e + FMOVS $(0.96875), F3 // 03f02d1e + FMOVD $(28.0), F4 // 0490671e FMOVS (R2)(R6), F4 // FMOVS (R2)(R6*1), F4 // 446866bc FMOVS (R2)(R6<<2), F4 // 447866bc @@ -479,14 +485,14 @@ again: // { // outcode($1, &$2, NREG, &$4); // } - FADDD $0.5, F1 // FADDD $(0.5), F1 +// FADDD $0.5, F1 // FADDD $(0.5), F1 FADDD F1, F2 // LTYPEK frcon ',' freg ',' freg // { // outcode($1, &$2, $4.reg, &$6); // } - FADDD $0.7, F1, F2 // FADDD $(0.69999999999999996), F1, F2 +// FADDD $0.7, F1, F2 // FADDD $(0.69999999999999996), F1, F2 FADDD F1, F2, F3 // diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index 00232ccd55..7507976257 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -219,8 +219,6 @@ var optab = []Optab{ {AFADDS, C_FREG, C_NONE, C_NONE, C_FREG, 54, 4, 0, 0, 0}, {AFADDS, C_FREG, C_FREG, C_NONE, C_FREG, 54, 4, 0, 0, 0}, - {AFADDS, C_FCON, C_NONE, C_NONE, C_FREG, 54, 4, 0, 0, 0}, - {AFADDS, C_FCON, C_FREG, C_NONE, C_FREG, 54, 4, 0, 0, 0}, {AFMSUBD, C_FREG, C_FREG, C_FREG, C_FREG, 15, 4, 0, 0, 0}, {AFCMPS, C_FREG, C_FREG, C_NONE, C_NONE, 56, 4, 0, 0, 0}, {AFCMPS, C_FCON, C_FREG, C_NONE, C_NONE, 56, 4, 0, 0, 0}, @@ -340,9 +338,9 @@ var optab = []Optab{ {AFMOVS, C_ADDR, C_NONE, C_NONE, C_FREG, 65, 12, 0, 0, 0}, {AFMOVD, C_FREG, C_NONE, C_NONE, C_ADDR, 64, 12, 0, 0, 0}, {AFMOVD, C_ADDR, C_NONE, C_NONE, C_FREG, 65, 12, 0, 0, 0}, - {AFMOVS, C_FCON, 
C_NONE, C_NONE, C_FREG, 54, 4, 0, 0, 0}, + {AFMOVS, C_FCON, C_NONE, C_NONE, C_FREG, 55, 4, 0, 0, 0}, {AFMOVS, C_FREG, C_NONE, C_NONE, C_FREG, 54, 4, 0, 0, 0}, - {AFMOVD, C_FCON, C_NONE, C_NONE, C_FREG, 54, 4, 0, 0, 0}, + {AFMOVD, C_FCON, C_NONE, C_NONE, C_FREG, 55, 4, 0, 0, 0}, {AFMOVD, C_FREG, C_NONE, C_NONE, C_FREG, 54, 4, 0, 0, 0}, {AFMOVS, C_REG, C_NONE, C_NONE, C_FREG, 29, 4, 0, 0, 0}, {AFMOVS, C_FREG, C_NONE, C_NONE, C_REG, 29, 4, 0, 0, 0}, @@ -2461,6 +2459,9 @@ func buildop(ctxt *obj.Link) { } } +// chipfloat7() checks whether a floating-point constant can be encoded as an immediate in FMOVS/FMOVD instructions. +// For details of the range of constants available, see +// http://infocenter.arm.com/help/topic/com.arm.doc.dui0473m/dom1359731199385.html. func (c *ctxt7) chipfloat7(e float64) int { ei := math.Float64bits(e) l := uint32(int32(ei)) @@ -3486,19 +3487,7 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { case 54: /* floating point arith */ o1 = c.oprrr(p, p.As) - - var rf int - if p.From.Type == obj.TYPE_CONST { - rf = c.chipfloat7(p.From.Val.(float64)) - if rf < 0 || true { - c.ctxt.Diag("invalid floating-point immediate\n%v", p) - rf = 0 - } - - rf |= (1 << 3) - } else { - rf = int(p.From.Reg) - } + rf := int(p.From.Reg) rt := int(p.To.Reg) r := int(p.Reg) if (o1&(0x1F<<24)) == (0x1E<<24) && (o1&(1<<11)) == 0 { /* monadic */ @@ -3509,6 +3498,18 @@ func (c *ctxt7) asmout(p *obj.Prog, o *Optab, out []uint32) { } o1 |= (uint32(rf&31) << 16) | (uint32(r&31) << 5) | uint32(rt&31) + case 55: /* floating-point constant */ + var rf int + o1 = 0xf<<25 | 1<<21 | 1<<12 + rf = c.chipfloat7(p.From.Val.(float64)) + if rf < 0 { + c.ctxt.Diag("invalid floating-point immediate\n%v", p) + } + if p.As == AFMOVD { + o1 |= 1 << 22 + } + o1 |= (uint32(rf&0xff) << 13) | uint32(p.To.Reg&31) + case 56: /* floating point compare */ o1 = c.oprrr(p, p.As) diff --git a/src/cmd/internal/obj/arm64/obj7.go b/src/cmd/internal/obj/arm64/obj7.go index 97b8f70c9b..4476dad071 100644 ---
a/src/cmd/internal/obj/arm64/obj7.go +++ b/src/cmd/internal/obj/arm64/obj7.go @@ -254,7 +254,11 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { switch p.As { case AFMOVS: if p.From.Type == obj.TYPE_FCONST { - f32 := float32(p.From.Val.(float64)) + f64 := p.From.Val.(float64) + f32 := float32(f64) + if c.chipfloat7(f64) > 0 { + break + } if math.Float32bits(f32) == 0 { p.From.Type = obj.TYPE_REG p.From.Reg = REGZERO @@ -269,6 +273,9 @@ func progedit(ctxt *obj.Link, p *obj.Prog, newprog obj.ProgAlloc) { case AFMOVD: if p.From.Type == obj.TYPE_FCONST { f64 := p.From.Val.(float64) + if c.chipfloat7(f64) > 0 { + break + } if math.Float64bits(f64) == 0 { p.From.Type = obj.TYPE_REG p.From.Reg = REGZERO