From 816ff44479ae1f1e9459221f63206e93f6f12824 Mon Sep 17 00:00:00 2001
From: Lynn Boger
Date: Fri, 4 Oct 2019 16:03:36 -0400
Subject: [PATCH] cmd/compile: use vsx loads and stores for LoweredMove,
 LoweredZero on ppc64x
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This improves the code generated for LoweredMove and LoweredZero by
using LXVD2X and STXVD2X to move 16 bytes at a time. These
instructions are now used if the size to be moved or zeroed is >= 64.
These same instructions have already been used in the asm
implementations for memmove and memclr.

Some examples where this shows an improvement on power8:

MakeSlice/Byte                  27.3ns ± 1%  25.2ns ± 0%   -7.69%
MakeSlice/Int16                 40.2ns ± 0%  35.2ns ± 0%  -12.39%
MakeSlice/Int                   94.9ns ± 1%  77.9ns ± 0%  -17.92%
MakeSlice/Ptr                    129ns ± 1%   103ns ± 0%  -20.16%
MakeSlice/Struct/24              176ns ± 1%   131ns ± 0%  -25.67%
MakeSlice/Struct/32              200ns ± 1%   142ns ± 0%  -29.09%
MakeSlice/Struct/40              220ns ± 2%   156ns ± 0%  -28.82%
GrowSlice/Byte                  81.4ns ± 0%  73.4ns ± 0%   -9.88%
GrowSlice/Int16                  118ns ± 1%    98ns ± 0%  -17.03%
GrowSlice/Int                    178ns ± 1%   134ns ± 1%  -24.65%
GrowSlice/Ptr                    249ns ± 4%   212ns ± 0%  -14.94%
GrowSlice/Struct/24              294ns ± 5%   215ns ± 0%  -27.08%
GrowSlice/Struct/32              315ns ± 1%   248ns ± 0%  -21.49%
GrowSlice/Struct/40              382ns ± 4%   289ns ± 1%  -24.38%
ExtendSlice/IntSlice             109ns ± 1%    90ns ± 1%  -17.51%
ExtendSlice/PointerSlice         142ns ± 2%   118ns ± 0%  -16.75%
ExtendSlice/NoGrow              6.02ns ± 0%  5.88ns ± 0%   -2.33%
Append                          27.2ns ± 0%  27.6ns ± 0%   +1.38%
AppendGrowByte                  4.20ms ± 3%  2.60ms ± 0%  -38.18%
AppendGrowString                 134ms ± 3%   102ms ± 2%  -23.62%
AppendSlice/1Bytes              5.65ns ± 0%  5.67ns ± 0%   +0.35%
AppendSlice/4Bytes              6.40ns ± 0%  6.55ns ± 0%   +2.34%
AppendSlice/7Bytes              8.74ns ± 0%  8.84ns ± 0%   +1.14%
AppendSlice/8Bytes              5.68ns ± 0%  5.70ns ± 0%   +0.40%
AppendSlice/15Bytes             9.31ns ± 0%  9.39ns ± 0%   +0.86%
AppendSlice/16Bytes             14.0ns ± 0%   5.8ns ± 0%  -58.32%
AppendSlice/32Bytes             5.72ns ± 0%  5.68ns ± 0%   -0.66%
AppendSliceLarge/1024Bytes       918ns ± 8%   615ns ± 1%  -33.00%
AppendSliceLarge/4096Bytes      3.25µs ± 1%  1.92µs ± 1%  -40.84%
AppendSliceLarge/16384Bytes     8.70µs ± 2%  4.69µs ± 0%  -46.08%
AppendSliceLarge/65536Bytes     18.1µs ± 3%   7.9µs ± 0%  -56.30%
AppendSliceLarge/262144Bytes    69.8µs ± 2%  25.9µs ± 0%  -62.91%
AppendSliceLarge/1048576Bytes    258µs ± 1%    93µs ± 0%  -63.96%

Change-Id: I21625dbe231a2029ddb9f7d73f5a6417b35c1e49
Reviewed-on: https://go-review.googlesource.com/c/go/+/199639
Run-TryBot: Lynn Boger
TryBot-Result: Gobot Gobot
Reviewed-by: Cherry Zhang
---
 src/cmd/compile/internal/ppc64/ssa.go        | 218 +++++++++++++------
 src/cmd/compile/internal/ssa/gen/PPC64Ops.go |  57 ++---
 src/cmd/compile/internal/ssa/opGen.go        |   2 +-
 3 files changed, 182 insertions(+), 95 deletions(-)

diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go
index 69847c38d2..4f852b883a 100644
--- a/src/cmd/compile/internal/ppc64/ssa.go
+++ b/src/cmd/compile/internal/ppc64/ssa.go
@@ -855,13 +855,13 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		// for sizes >= 64 generate a loop as follows:
 
 		// set up loop counter in CTR, used by BC
+		//	XXLXOR VS32,VS32,VS32
 		//	MOVD len/32,REG_TMP
 		//	MOVD REG_TMP,CTR
+		//	MOVD $16,REG_TMP
 		// loop:
-		//	MOVD R0,(R3)
-		//	MOVD R0,8(R3)
-		//	MOVD R0,16(R3)
-		//	MOVD R0,24(R3)
+		//	STXVD2X VS32,(R0)(R3)
+		//	STXVD2X VS32,(R31)(R3)
 		//	ADD $32,R3
 		//	BC 16, 0, loop
 		//
@@ -895,8 +895,16 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		// only generate a loop if there is more
 		// than 1 iteration.
 		if ctr > 1 {
+			// Set up VS32 (V0) to hold 0s
+			p := s.Prog(ppc64.AXXLXOR)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+			p.Reg = ppc64.REG_VS32
+
 			// Set up CTR loop counter
-			p := s.Prog(ppc64.AMOVD)
+			p = s.Prog(ppc64.AMOVD)
 			p.From.Type = obj.TYPE_CONST
 			p.From.Offset = ctr
 			p.To.Type = obj.TYPE_REG
@@ -908,23 +916,35 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			p.To.Type = obj.TYPE_REG
 			p.To.Reg = ppc64.REG_CTR
 
-			// generate 4 MOVDs
+			// Set up R31 to hold index value 16
+			p = s.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 16
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+
+			// generate 2 STXVD2Xs to store 16 bytes each
 			// when this is a loop then the top must be saved
 			var top *obj.Prog
-			for offset := int64(0); offset < 32; offset += 8 {
-				// This is the top of loop
-				p := s.Prog(ppc64.AMOVD)
-				p.From.Type = obj.TYPE_REG
-				p.From.Reg = ppc64.REG_R0
-				p.To.Type = obj.TYPE_MEM
-				p.To.Reg = v.Args[0].Reg()
-				p.To.Offset = offset
-				// Save the top of loop
-				if top == nil {
-					top = p
-				}
+			// This is the top of loop
+			p = s.Prog(ppc64.ASTXVD2X)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Index = ppc64.REGZERO
+			// Save the top of loop
+			if top == nil {
+				top = p
 			}
+			p = s.Prog(ppc64.ASTXVD2X)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Index = ppc64.REGTMP
+
 			// Increment address for the
 			// 4 doublewords just zeroed.
 			p = s.Prog(ppc64.AADD)
@@ -994,30 +1014,27 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		// When moving >= 64 bytes a loop is used
 		//	MOVD len/32,REG_TMP
 		//	MOVD REG_TMP,CTR
+		//	MOVD $16,REG_TMP
 		// top:
-		//	MOVD (R4),R7
-		//	MOVD 8(R4),R8
-		//	MOVD 16(R4),R9
-		//	MOVD 24(R4),R10
-		//	ADD R4,$32
-		//	MOVD R7,(R3)
-		//	MOVD R8,8(R3)
-		//	MOVD R9,16(R3)
-		//	MOVD R10,24(R3)
-		//	ADD R3,$32
+		//	LXVD2X (R0)(R4),VS32
+		//	LXVD2X (R31)(R4),VS33
+		//	ADD $32,R4
+		//	STXVD2X VS32,(R0)(R3)
+		//	STXVD2X VS33,(R31)(R3)
+		//	ADD $32,R3
 		//	BC 16,0,top
 		// Bytes not moved by this loop are moved
 		// with a combination of the following instructions,
 		// starting with the largest sizes and generating as
 		// many as needed, using the appropriate offset value.
-		//	MOVD n(R4),R7
-		//	MOVD R7,n(R3)
-		//	MOVW n1(R4),R7
-		//	MOVW R7,n1(R3)
-		//	MOVH n2(R4),R7
-		//	MOVH R7,n2(R3)
-		//	MOVB n3(R4),R7
-		//	MOVB R7,n3(R3)
+		//	MOVD n(R4),R14
+		//	MOVD R14,n(R3)
+		//	MOVW n1(R4),R14
+		//	MOVW R14,n1(R3)
+		//	MOVH n2(R4),R14
+		//	MOVH R14,n2(R3)
+		//	MOVB n3(R4),R14
+		//	MOVB R14,n3(R3)
 
 		// Each loop iteration moves 32 bytes
 		ctr := v.AuxInt / 32
@@ -1030,7 +1047,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 
 		// The set of registers used here, must match the clobbered reg list
 		// in PPC64Ops.go.
-		useregs := []int16{ppc64.REG_R7, ppc64.REG_R8, ppc64.REG_R9, ppc64.REG_R10}
 		offset := int64(0)
 
 		// top of the loop
@@ -1050,22 +1066,35 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = ppc64.REG_CTR
 
-		// Generate all the MOVDs for loads
-		// based off the same register, increasing
-		// the offset by 8 for each instruction
-		for _, rg := range useregs {
-			p := s.Prog(ppc64.AMOVD)
-			p.From.Type = obj.TYPE_MEM
-			p.From.Reg = src_reg
-			p.From.Offset = offset
-			p.To.Type = obj.TYPE_REG
-			p.To.Reg = rg
-			if top == nil {
-				top = p
-			}
-			offset += 8
+		// Use REGTMP as index reg
+		p = s.Prog(ppc64.AMOVD)
+		p.From.Type = obj.TYPE_CONST
+		p.From.Offset = 16
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = ppc64.REGTMP
+
+		// Generate 16 byte loads and stores.
+		// Use temp register for index (16)
+		// on the second one.
+		p = s.Prog(ppc64.ALXVD2X)
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = src_reg
+		p.From.Index = ppc64.REGZERO
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = ppc64.REG_VS32
+
+		if top == nil {
+			top = p
 		}
-		// increment the src_reg for next iteration
+
+		p = s.Prog(ppc64.ALXVD2X)
+		p.From.Type = obj.TYPE_MEM
+		p.From.Reg = src_reg
+		p.From.Index = ppc64.REGTMP
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = ppc64.REG_VS33
+
+		// increment the src reg for next iteration
 		p = s.Prog(ppc64.AADD)
 		p.Reg = src_reg
 		p.From.Type = obj.TYPE_CONST
@@ -1073,20 +1102,22 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = src_reg
 
-		// generate the MOVDs for stores, based
-		// off the same register, using the same
-		// offsets as in the loads.
-		offset = int64(0)
-		for _, rg := range useregs {
-			p := s.Prog(ppc64.AMOVD)
-			p.From.Type = obj.TYPE_REG
-			p.From.Reg = rg
-			p.To.Type = obj.TYPE_MEM
-			p.To.Reg = dst_reg
-			p.To.Offset = offset
-			offset += 8
-		}
-		// increment the dst_reg for next iteration
+		// generate 16 byte stores
+		p = s.Prog(ppc64.ASTXVD2X)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = ppc64.REG_VS32
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = dst_reg
+		p.To.Index = ppc64.REGZERO
+
+		p = s.Prog(ppc64.ASTXVD2X)
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = ppc64.REG_VS33
+		p.To.Type = obj.TYPE_MEM
+		p.To.Reg = dst_reg
+		p.To.Index = ppc64.REGTMP
+
+		// increment the dst reg for next iteration
 		p = s.Prog(ppc64.AADD)
 		p.Reg = dst_reg
 		p.From.Type = obj.TYPE_CONST
@@ -1114,6 +1145,57 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			rem += 32
 		}
 
+		if rem >= 16 {
+			// Generate a 16 byte load and store;
+			// a second pair, indexed by REGTMP
+			// (value 16), follows below if needed.
+			p := s.Prog(ppc64.ALXVD2X)
+			p.From.Type = obj.TYPE_MEM
+			p.From.Reg = src_reg
+			p.From.Index = ppc64.REGZERO
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_VS32
+
+			p = s.Prog(ppc64.ASTXVD2X)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_VS32
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = dst_reg
+			p.To.Index = ppc64.REGZERO
+
+			offset = 16
+			rem -= 16
+
+			if rem >= 16 {
+				// Use REGTMP as index reg
+				p = s.Prog(ppc64.AMOVD)
+				p.From.Type = obj.TYPE_CONST
+				p.From.Offset = 16
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = ppc64.REGTMP
+
+				// Generate the second 16 byte load
+				// and store, using REGTMP (16) as
+				// the index.
+				p = s.Prog(ppc64.ALXVD2X)
+				p.From.Type = obj.TYPE_MEM
+				p.From.Reg = src_reg
+				p.From.Index = ppc64.REGTMP
+				p.To.Type = obj.TYPE_REG
+				p.To.Reg = ppc64.REG_VS32
+
+				p = s.Prog(ppc64.ASTXVD2X)
+				p.From.Type = obj.TYPE_REG
+				p.From.Reg = ppc64.REG_VS32
+				p.To.Type = obj.TYPE_MEM
+				p.To.Reg = dst_reg
+				p.To.Index = ppc64.REGTMP
+
+				offset = 32
+				rem -= 16
+			}
+		}
+
 		// Generate all the remaining load and store pairs, starting with
 		// as many 8 byte moves as possible, then 4, 2, 1.
 		for rem > 0 {
@@ -1129,7 +1211,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			// Load
 			p := s.Prog(op)
 			p.To.Type = obj.TYPE_REG
-			p.To.Reg = ppc64.REG_R7
+			p.To.Reg = ppc64.REG_R14
 			p.From.Type = obj.TYPE_MEM
 			p.From.Reg = src_reg
 			p.From.Offset = offset
@@ -1137,7 +1219,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 			// Store
 			p = s.Prog(op)
 			p.From.Type = obj.TYPE_REG
-			p.From.Reg = ppc64.REG_R7
+			p.From.Reg = ppc64.REG_R14
 			p.To.Type = obj.TYPE_MEM
 			p.To.Reg = dst_reg
 			p.To.Offset = offset
diff --git a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
index 5505db5222..a6bcc26543 100644
--- a/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
+++ b/src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -416,13 +416,13 @@ func init() {
 		// a loop is generated when there is more than one iteration
 		// needed to clear 4 doublewords
 		//
+		//	XXLXOR VS32,VS32,VS32
 		//	MOVD $len/32,R31
 		//	MOVD R31,CTR
+		//	MOVD $16,R31
 		// loop:
-		//	MOVD R0,(R3)
-		//	MOVD R0,8(R3)
-		//	MOVD R0,16(R3)
-		//	MOVD R0,24(R3)
+		//	STXVD2X VS32,(R0)(R3)
+		//	STXVD2X VS32,(R31)(R3)
 		//	ADD R3,32
 		//	BC loop
@@ -448,33 +448,38 @@ func init() {
 			typ:            "Mem",
 			faultOnNilArg0: true,
 		},
 
+		// R31 is temp register
 		// Loop code:
-		//	MOVD len/32,REG_TMP  only for loop
-		//	MOVD REG_TMP,CTR     only for loop
+		//	MOVD len/32,R31  set up loop ctr
+		//	MOVD R31,CTR
+		//	MOVD $16,R31  index register
 		// loop:
-		//	MOVD (R4),R7
-		//	MOVD 8(R4),R8
-		//	MOVD 16(R4),R9
-		//	MOVD 24(R4),R10
-		//	ADD R4,$32   only with loop
-		//	MOVD R7,(R3)
-		//	MOVD R8,8(R3)
-		//	MOVD R9,16(R3)
-		//	MOVD R10,24(R3)
-		//	ADD R3,$32   only with loop
-		//	BC 16,0,loop  only with loop
+		//	LXVD2X (R0)(R4),VS32
+		//	LXVD2X (R31)(R4),VS33
+		//	ADD R4,$32   increment src
+		//	STXVD2X VS32,(R0)(R3)
+		//	STXVD2X VS33,(R31)(R3)
+		//	ADD R3,$32   increment dst
+		//	BC 16,0,loop  branch ctr
+		// For this purpose, VS32 and VS33 are treated as
+		// scratch registers. Since regalloc does not
+		// track vector registers, even if they were marked
+		// as clobbered it would have no effect.
+		// TODO: If vector registers are managed by regalloc,
+		// mark these as clobbered.
+		//
 		// Bytes not moved by this loop are moved
 		// with a combination of the following instructions,
 		// starting with the largest sizes and generating as
 		// many as needed, using the appropriate offset value.
-		//	MOVD n(R4),R7
-		//	MOVD R7,n(R3)
-		//	MOVW n1(R4),R7
-		//	MOVW R7,n1(R3)
-		//	MOVH n2(R4),R7
-		//	MOVH R7,n2(R3)
-		//	MOVB n3(R4),R7
-		//	MOVB R7,n3(R3)
+		//	MOVD n(R4),R14
+		//	MOVD R14,n(R3)
+		//	MOVW n1(R4),R14
+		//	MOVW R14,n1(R3)
+		//	MOVH n2(R4),R14
+		//	MOVH R14,n2(R3)
+		//	MOVB n3(R4),R14
+		//	MOVB R14,n3(R3)
 		{
 			name: "LoweredMove",
@@ -482,7 +487,7 @@ func init() {
 			argLength: 3,
 			reg: regInfo{
 				inputs:   []regMask{buildReg("R3"), buildReg("R4")},
-				clobbers: buildReg("R3 R4 R7 R8 R9 R10"),
+				clobbers: buildReg("R3 R4 R14"),
 			},
 			clobberFlags: true,
 			typ:          "Mem",
diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go
index 795c6bbdf7..c30654dda7 100644
--- a/src/cmd/compile/internal/ssa/opGen.go
+++ b/src/cmd/compile/internal/ssa/opGen.go
@@ -24486,7 +24486,7 @@ var opcodeTable = [...]opInfo{
 				{0, 8},  // R3
 				{1, 16}, // R4
 			},
-			clobbers: 1944, // R3 R4 R7 R8 R9 R10
+			clobbers: 16408, // R3 R4 R14
 		},
 	},
 	{
-- 
2.50.0
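
For illustration only (not part of the CL): the rewritten lowering applies to
any SSA Move or Zero whose constant size is >= 64 bytes, so a minimal sketch
that should exercise both paths on ppc64x is a fixed-size array copy and
clear. The package and function names below are hypothetical, chosen just for
this example:

	package vsx

	// moveBlock copies a 64-byte array. The assignment is an SSA Move
	// of constant size 64, which this CL emits as a loop of paired
	// LXVD2X/STXVD2X instructions (LoweredMove).
	func moveBlock(dst, src *[64]byte) {
		*dst = *src
	}

	// zeroBlock clears a 64-byte array. The assignment is an SSA Zero
	// of constant size 64, emitted as XXLXOR to zero VS32 followed by
	// a loop of STXVD2X stores (LoweredZero).
	func zeroBlock(p *[64]byte) {
		*p = [64]byte{}
	}

Building with GOARCH=ppc64le go build -gcflags=-S and reading the generated
assembly for these functions should show the vector loops described in the
comments above in place of the old four-MOVD sequences.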