]> Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile,cmd/internal/obj/ppc64: use mulli where possible
authorLynn Boger <laboger@linux.vnet.ibm.com>
Fri, 2 Oct 2020 21:51:13 +0000 (17:51 -0400)
committerLynn Boger <laboger@linux.vnet.ibm.com>
Tue, 6 Oct 2020 19:40:46 +0000 (19:40 +0000)
This adds support to allow the use of mulli when one of the multiply
operands is a constant that fits in 16 bits.

This especially helps in the case where this instruction appears in
a loop since the load of the constant is not being moved out of the loop.

Some improvements seen in compress/flate on power9:

Decode/Digits/Huffman/1e4         259µs ± 0%     261µs ± 0%   +0.57%  (p=1.000 n=1+1)
Decode/Digits/Huffman/1e5        2.43ms ± 0%    2.45ms ± 0%   +0.79%  (p=1.000 n=1+1)
Decode/Digits/Huffman/1e6        23.9ms ± 0%    24.2ms ± 0%   +0.86%  (p=1.000 n=1+1)
Decode/Digits/Speed/1e4           278µs ± 0%     279µs ± 0%   +0.34%  (p=1.000 n=1+1)
Decode/Digits/Speed/1e5          2.80ms ± 0%    2.81ms ± 0%   +0.29%  (p=1.000 n=1+1)
Decode/Digits/Speed/1e6          28.0ms ± 0%    28.1ms ± 0%   +0.28%  (p=1.000 n=1+1)
Decode/Digits/Default/1e4         278µs ± 0%     278µs ± 0%   +0.28%  (p=1.000 n=1+1)
Decode/Digits/Default/1e5        2.68ms ± 0%    2.69ms ± 0%   +0.19%  (p=1.000 n=1+1)
Decode/Digits/Default/1e6        26.6ms ± 0%    26.6ms ± 0%   +0.21%  (p=1.000 n=1+1)
Decode/Digits/Compression/1e4     278µs ± 0%     278µs ± 0%   +0.00%  (p=1.000 n=1+1)
Decode/Digits/Compression/1e5    2.68ms ± 0%    2.69ms ± 0%   +0.21%  (p=1.000 n=1+1)
Decode/Digits/Compression/1e6    26.6ms ± 0%    26.6ms ± 0%   +0.07%  (p=1.000 n=1+1)
Decode/Newton/Huffman/1e4         322µs ± 0%     312µs ± 0%   -2.84%  (p=1.000 n=1+1)
Decode/Newton/Huffman/1e5        3.11ms ± 0%    2.91ms ± 0%   -6.41%  (p=1.000 n=1+1)
Decode/Newton/Huffman/1e6        31.4ms ± 0%    29.3ms ± 0%   -6.85%  (p=1.000 n=1+1)
Decode/Newton/Speed/1e4           282µs ± 0%     269µs ± 0%   -4.69%  (p=1.000 n=1+1)
Decode/Newton/Speed/1e5          2.29ms ± 0%    2.20ms ± 0%   -4.13%  (p=1.000 n=1+1)
Decode/Newton/Speed/1e6          22.7ms ± 0%    21.3ms ± 0%   -6.06%  (p=1.000 n=1+1)
Decode/Newton/Default/1e4         254µs ± 0%     237µs ± 0%   -6.60%  (p=1.000 n=1+1)
Decode/Newton/Default/1e5        1.86ms ± 0%    1.75ms ± 0%   -5.99%  (p=1.000 n=1+1)
Decode/Newton/Default/1e6        18.1ms ± 0%    17.4ms ± 0%   -4.10%  (p=1.000 n=1+1)
Decode/Newton/Compression/1e4     254µs ± 0%     244µs ± 0%   -3.91%  (p=1.000 n=1+1)
Decode/Newton/Compression/1e5    1.85ms ± 0%    1.79ms ± 0%   -3.10%  (p=1.000 n=1+1)
Decode/Newton/Compression/1e6    18.0ms ± 0%    17.3ms ± 0%   -3.88%  (p=1.000 n=1+1)

Change-Id: I840320fab1c4bf64c76b001c2651ab79f23df4eb
Reviewed-on: https://go-review.googlesource.com/c/go/+/259444
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Paul Murphy <murp@ibm.com>
Reviewed-by: Carlos Eduardo Seo <carlos.seo@gmail.com>
Trust: Lynn Boger <laboger@linux.vnet.ibm.com>

src/cmd/asm/internal/asm/testdata/ppc64enc.s
src/cmd/compile/internal/gc/bench_test.go
src/cmd/compile/internal/ppc64/ssa.go
src/cmd/compile/internal/ssa/gen/PPC64.rules
src/cmd/compile/internal/ssa/gen/PPC64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewritePPC64.go
src/cmd/internal/obj/ppc64/asm9.go

index 869f8c2d4f8050329f4015549e168caa0ea67634..c6d7b59aad6656d14dc64e6c94a323359ee7d914 100644 (file)
@@ -204,12 +204,16 @@ TEXT asmtest(SB),DUPOK|NOSPLIT,$0
 
        MULLW R3, R4                    // 7c8419d6
        MULLW R3, R4, R5                // 7ca419d6
+       MULLW $10, R3                   // 1c63000a
+       MULLW $10000000, R3             // 641f009863ff96807c7f19d6
        MULLWCC R3, R4, R5              // 7ca419d7
        MULHW R3, R4, R5                // 7ca41896
 
        MULHWU R3, R4, R5               // 7ca41816
        MULLD R3, R4                    // 7c8419d2
        MULLD R4, R4, R5                // 7ca421d2
+       MULLD $20, R4                   // 1c840014
+       MULLD $200000000, R4            // 641f0beb63ffc2007c9f21d2
        MULLDCC R3, R4, R5              // 7ca419d3
        MULHD R3, R4, R5                // 7ca41892
        MULHDCC R3, R4, R5              // 7ca41893
index a2887f2f7bec7d9fae4de1737ed1c925ba618065..8c4288128f25b00bc41e67d8db6a9a43da9a5320 100644 (file)
@@ -7,6 +7,7 @@ package gc
 import "testing"
 
 var globl int64
+var globl32 int32
 
 func BenchmarkLoadAdd(b *testing.B) {
        x := make([]int64, 1024)
@@ -42,6 +43,17 @@ func BenchmarkModify(b *testing.B) {
        }
 }
 
+func BenchmarkMullImm(b *testing.B) {
+       x := make([]int32, 1024)
+       for i := 0; i < b.N; i++ {
+               var s int32
+               for i := range x {
+                       s += x[i] * 100
+               }
+               globl32 = s
+       }
+}
+
 func BenchmarkConstModify(b *testing.B) {
        a := make([]int64, 1024)
        for i := 0; i < b.N; i++ {
index d83b2df379e9d6779eeab1b222bd6c50c636a706..1ece4d999fcb4b67f4a0a49fead2fd6050a3dcc4 100644 (file)
@@ -677,7 +677,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
                p.From.Reg = v.Args[0].Reg()
 
        case ssa.OpPPC64ADDconst, ssa.OpPPC64ANDconst, ssa.OpPPC64ORconst, ssa.OpPPC64XORconst,
-               ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst, ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst, ssa.OpPPC64EXTSWSLconst:
+               ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst,
+               ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst, ssa.OpPPC64EXTSWSLconst, ssa.OpPPC64MULLWconst, ssa.OpPPC64MULLDconst:
                p := s.Prog(v.Op.Asm())
                p.Reg = v.Args[0].Reg()
                p.From.Type = obj.TYPE_CONST
index 83ee4c499bed0343433a113d810bf57af1269682..a05cfee654127233852192b628363e919bd9a00d 100644 (file)
 
 (ADDconst [c] (MOVDaddr [d] {sym} x)) && is32Bit(c+int64(d)) => (MOVDaddr [int32(c+int64(d))] {sym} x)
 
+(MULL(W|D) x (MOVDconst [c])) && is16Bit(c) => (MULL(W|D)const [int32(c)] x)
+
 // Subtract from (with carry, but ignored) constant.
 // Note, these clobber the carry bit.
 (SUB (MOVDconst [c]) x) && is32Bit(c) => (SUBFCconst [c] x)
index 28317928a85ef0f3d8cf22b3cee6b81211033e85..5885660597418d449771f5b50b32e349532638a4 100644 (file)
@@ -181,6 +181,8 @@ func init() {
 
                {name: "MULLD", argLength: 2, reg: gp21, asm: "MULLD", typ: "Int64", commutative: true}, // arg0*arg1 (signed 64-bit)
                {name: "MULLW", argLength: 2, reg: gp21, asm: "MULLW", typ: "Int32", commutative: true}, // arg0*arg1 (signed 32-bit)
+               {name: "MULLDconst", argLength: 1, reg: gp11, asm: "MULLD", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
+               {name: "MULLWconst", argLength: 1, reg: gp11, asm: "MULLW", aux: "Int32", typ: "Int64"}, // arg0*auxInt (signed 64-bit)
                {name: "MADDLD", argLength: 3, reg: gp31, asm: "MADDLD", typ: "Int64"},                  // (arg0*arg1)+arg2 (signed 64-bit)
 
                {name: "MULHD", argLength: 2, reg: gp21, asm: "MULHD", commutative: true},   // (arg0 * arg1) >> 64, signed
index d7d2b24a4840a7a4e31ffcdcc5da515499929993..051550fb17160d7ad394a20055eaa85b335ce39a 100644 (file)
@@ -1832,6 +1832,8 @@ const (
        OpPPC64FSUBS
        OpPPC64MULLD
        OpPPC64MULLW
+       OpPPC64MULLDconst
+       OpPPC64MULLWconst
        OpPPC64MADDLD
        OpPPC64MULHD
        OpPPC64MULHW
@@ -24377,6 +24379,34 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:    "MULLDconst",
+               auxType: auxInt32,
+               argLen:  1,
+               asm:     ppc64.AMULLD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+                       outputs: []outputInfo{
+                               {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+               },
+       },
+       {
+               name:    "MULLWconst",
+               auxType: auxInt32,
+               argLen:  1,
+               asm:     ppc64.AMULLW,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+                       outputs: []outputInfo{
+                               {0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
+                       },
+               },
+       },
        {
                name:   "MADDLD",
                argLen: 3,
index 9822637b0583aff3b8e5336984917597f95cbb5c..1b8a5a78ca27e5c2eff8b1cfe05e9829a69cf2a2 100644 (file)
@@ -568,6 +568,10 @@ func rewriteValuePPC64(v *Value) bool {
                return rewriteValuePPC64_OpPPC64MOVWstorezero(v)
        case OpPPC64MTVSRD:
                return rewriteValuePPC64_OpPPC64MTVSRD(v)
+       case OpPPC64MULLD:
+               return rewriteValuePPC64_OpPPC64MULLD(v)
+       case OpPPC64MULLW:
+               return rewriteValuePPC64_OpPPC64MULLW(v)
        case OpPPC64NEG:
                return rewriteValuePPC64_OpPPC64NEG(v)
        case OpPPC64NOR:
@@ -11003,6 +11007,56 @@ func rewriteValuePPC64_OpPPC64MTVSRD(v *Value) bool {
        }
        return false
 }
+func rewriteValuePPC64_OpPPC64MULLD(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (MULLD x (MOVDconst [c]))
+       // cond: is16Bit(c)
+       // result: (MULLDconst [int32(c)] x)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       if v_1.Op != OpPPC64MOVDconst {
+                               continue
+                       }
+                       c := auxIntToInt64(v_1.AuxInt)
+                       if !(is16Bit(c)) {
+                               continue
+                       }
+                       v.reset(OpPPC64MULLDconst)
+                       v.AuxInt = int32ToAuxInt(int32(c))
+                       v.AddArg(x)
+                       return true
+               }
+               break
+       }
+       return false
+}
+func rewriteValuePPC64_OpPPC64MULLW(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (MULLW x (MOVDconst [c]))
+       // cond: is16Bit(c)
+       // result: (MULLWconst [int32(c)] x)
+       for {
+               for _i0 := 0; _i0 <= 1; _i0, v_0, v_1 = _i0+1, v_1, v_0 {
+                       x := v_0
+                       if v_1.Op != OpPPC64MOVDconst {
+                               continue
+                       }
+                       c := auxIntToInt64(v_1.AuxInt)
+                       if !(is16Bit(c)) {
+                               continue
+                       }
+                       v.reset(OpPPC64MULLWconst)
+                       v.AuxInt = int32ToAuxInt(int32(c))
+                       v.AddArg(x)
+                       return true
+               }
+               break
+       }
+       return false
+}
 func rewriteValuePPC64_OpPPC64NEG(v *Value) bool {
        v_0 := v.Args[0]
        // match: (NEG (ADDconst [c] x))
index 928e299f43e2577c7c3c338967c70cccdf19d35b..c2e8e9e9d09263963ff2f0456c416d3a7f6bf62d 100644 (file)
@@ -1279,6 +1279,9 @@ func buildop(ctxt *obj.Link) {
                case AREMD:
                        opset(AREMDU, r0)
 
+               case AMULLW:
+                       opset(AMULLD, r0)
+
                case ADIVW: /* op Rb[,Ra],Rd */
                        opset(AMULHW, r0)
 
@@ -1312,7 +1315,6 @@ func buildop(ctxt *obj.Link) {
                        opset(AMULHDCC, r0)
                        opset(AMULHDU, r0)
                        opset(AMULHDUCC, r0)
-                       opset(AMULLD, r0)
                        opset(AMULLDCC, r0)
                        opset(AMULLDVCC, r0)
                        opset(AMULLDV, r0)
@@ -1996,7 +1998,6 @@ func buildop(ctxt *obj.Link) {
                        AMOVB,  /* macro: move byte with sign extension */
                        AMOVBU, /* macro: move byte with sign extension & update */
                        AMOVFL,
-                       AMULLW,
                        /* op $s[,r2],r3; op r1[,r2],r3; no cc/v */
                        ASUBC, /* op r1,$s,r3; op r1[,r2],r3 */
                        ASTSW,
@@ -4990,8 +4991,8 @@ func (c *ctxt9) opirr(a obj.As) uint32 {
        case ADARN:
                return OPVCC(31, 755, 0, 0) /* darn - v3.00 */
 
-       case AMULLW:
-               return OPVCC(7, 0, 0, 0)
+       case AMULLW, AMULLD:
+               return OPVCC(7, 0, 0, 0) /* mulli works with MULLW or MULLD */
 
        case AOR:
                return OPVCC(24, 0, 0, 0)