Cypherpunks repositories - gostls13.git/commitdiff
cmd/compile: add prefetch intrinsic support on loong64
author Guoqi Chen <chenguoqi@loongson.cn>
Mon, 12 May 2025 03:01:44 +0000 (11:01 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Mon, 19 May 2025 07:27:10 +0000 (00:27 -0700)
This CL enables intrinsic support to emit the following prefetch
instructions for loong64 platform:
  1. Prefetch - prefetches data from memory address to cache;
  2. PrefetchStreamed - prefetches data from memory address, with a
     hint that this data is being streamed.

Benchmarks picked from go/test/bench/garbage
Parameters tested with:
GOMAXPROCS=8
tree2 -heapsize=1000000000 -cpus=8
tree -n=18
parser
peano

Benchmarks Loongson-3A6000-HV @ 2500.00MHz:
         |   bench.old   |              bench.new               |
         |    sec/op     |    sec/op      vs base               |
Tree2-8    1238.2µ ± 24%   999.9µ ± 453%       ~ (p=0.089 n=10)
Tree-8      277.4m ±  1%   275.5m ±   1%       ~ (p=0.063 n=10)
Parser-8     3.564 ±  0%    3.509 ±   1%  -1.56% (p=0.000 n=10)
Peano-8     39.12m ±  2%   38.85m ±   2%       ~ (p=0.353 n=10)
geomean     83.19m         78.28m         -5.90%

Change-Id: I59e9aa4f609a106d4f70706e6d6d1fe6738ab72a
Reviewed-on: https://go-review.googlesource.com/c/go/+/671876
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
src/cmd/compile/internal/loong64/ssa.go
src/cmd/compile/internal/ssa/_gen/LOONG64.rules
src/cmd/compile/internal/ssa/_gen/LOONG64Ops.go
src/cmd/compile/internal/ssa/opGen.go
src/cmd/compile/internal/ssa/rewriteLOONG64.go
src/cmd/compile/internal/ssagen/intrinsics.go
src/cmd/compile/internal/ssagen/intrinsics_test.go

index 60516d6618bbb27da9ade2fbaf2cd33761577405..d60aef165cc31e98122b6bdb003de846e940ad0a 100644 (file)
@@ -948,6 +948,24 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
                p.Reg = v.Args[0].Reg()
                p.To.Type = obj.TYPE_REG
                p.To.Reg = v.Reg()
+
+       case ssa.OpLOONG64PRELD:
+               // PRELD (Rarg0), hint
+               p := s.Prog(v.Op.Asm())
+               p.From.Type = obj.TYPE_MEM
+               p.From.Reg = v.Args[0].Reg()
+               p.AddRestSourceConst(v.AuxInt & 0x1f)
+
+       case ssa.OpLOONG64PRELDX:
+               // PRELDX (Rarg0), $n, $hint
+               p := s.Prog(v.Op.Asm())
+               p.From.Type = obj.TYPE_MEM
+               p.From.Reg = v.Args[0].Reg()
+               p.AddRestSourceArgs([]obj.Addr{
+                       {Type: obj.TYPE_CONST, Offset: int64((v.AuxInt >> 5) & 0x1fffffffff)},
+                       {Type: obj.TYPE_CONST, Offset: int64((v.AuxInt >> 0) & 0x1f)},
+               })
+
        case ssa.OpClobber, ssa.OpClobberReg:
                // TODO: implement for clobberdead experiment. Nop is ok for now.
        default:
index 100bb232d4842b4495b65789bce551326cc71780..bec8493b997ff9744c4e23942ee56cc1a5e4a5ad 100644 (file)
 
 (MOVBUreg (ANDconst [c] x)) => (ANDconst [c&0xff] x)
 
+// Prefetch instructions (hint specified using aux field)
+// For PRELD{,X}, the value of hint indicates:
+//    hint=0 is defined as load prefetch to L1-cache
+//    hint=2 is defined as load prefetch to L3-cache
+// The PrefetchCacheStreamed implementation prefetches 512 bytes of data
+// into L3. The aux field is defined as follows:
+//    bit[4:0]:
+//       $hint parameter of PRELDX instruction
+//    bit[41:5]:
+//       $n parameter of PRELDX instruction, bit[0] of $n is the address
+//       sequence, bits[11:1] is the block size, bits[20:12] is the block
+//       num, bits[36:21] is the stride, for more details about $n, refer
+//       to src/cmd/internal/obj/loong64/doc.go
+(PrefetchCache addr mem)         => (PRELD  addr mem [0])
+(PrefetchCacheStreamed addr mem) => (PRELDX addr mem [(((512 << 1) + (1 << 12)) << 5) + 2])
+
 // constant comparisons
 (SGTconst [c] (MOVVconst [d])) && c>d => (MOVVconst [1])
 (SGTconst [c] (MOVVconst [d])) && c<=d => (MOVVconst [0])
index c68a24ca97d03a9297997c52f1ff93e424b8b9bb..dbfbcf1fd014ee18f103241dacbd58b86a579ec4 100644 (file)
@@ -148,6 +148,7 @@ func init() {
                gpstore2  = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
                gpxchg    = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
                gpcas     = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+               preldreg  = regInfo{inputs: []regMask{gpspg}}
                fp01      = regInfo{inputs: nil, outputs: []regMask{fp}}
                fp11      = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
                fp21      = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
@@ -566,6 +567,14 @@ func init() {
                {name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
                {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
                {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+
+               // Prefetch instruction
+               // Prefetch the address in arg0 with the option given in aux. arg0=addr, arg1=memory, aux=option.
+               // Note:
+               //   The aux of PRELDX is actually composed of two values: $hint and $n. bit[4:0]
+               //   is $hint and bit[41:5] is $n.
+               {name: "PRELD", argLength: 2, aux: "Int64", reg: preldreg, asm: "PRELD", hasSideEffects: true},
+               {name: "PRELDX", argLength: 2, aux: "Int64", reg: preldreg, asm: "PRELDX", hasSideEffects: true},
        }
 
        blocks := []blockData{
index 7b36344f073689e59a616cd256194f3982f1bb55..615aca3ba637935ceb3b0b4133aa602ea1b0f95e 100644 (file)
@@ -1962,6 +1962,8 @@ const (
        OpLOONG64LoweredPanicBoundsA
        OpLOONG64LoweredPanicBoundsB
        OpLOONG64LoweredPanicBoundsC
+       OpLOONG64PRELD
+       OpLOONG64PRELDX
 
        OpMIPSADD
        OpMIPSADDconst
@@ -26452,6 +26454,30 @@ var opcodeTable = [...]opInfo{
                        },
                },
        },
+       {
+               name:           "PRELD",
+               auxType:        auxInt64,
+               argLen:         2,
+               hasSideEffects: true,
+               asm:            loong64.APRELD,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1073741820}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+                       },
+               },
+       },
+       {
+               name:           "PRELDX",
+               auxType:        auxInt64,
+               argLen:         2,
+               hasSideEffects: true,
+               asm:            loong64.APRELDX,
+               reg: regInfo{
+                       inputs: []inputInfo{
+                               {0, 1073741820}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+                       },
+               },
+       },
 
        {
                name:        "ADD",
index fcff307c6556407f59ccd72bd510e2cf777848d9..f6575a81810f53e17c1cc36fcbe33d8e233f3d7a 100644 (file)
@@ -674,6 +674,10 @@ func rewriteValueLOONG64(v *Value) bool {
                return rewriteValueLOONG64_OpPopCount32(v)
        case OpPopCount64:
                return rewriteValueLOONG64_OpPopCount64(v)
+       case OpPrefetchCache:
+               return rewriteValueLOONG64_OpPrefetchCache(v)
+       case OpPrefetchCacheStreamed:
+               return rewriteValueLOONG64_OpPrefetchCacheStreamed(v)
        case OpPubBarrier:
                v.Op = OpLOONG64LoweredPubBarrier
                return true
@@ -9078,6 +9082,34 @@ func rewriteValueLOONG64_OpPopCount64(v *Value) bool {
                return true
        }
 }
+func rewriteValueLOONG64_OpPrefetchCache(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (PrefetchCache addr mem)
+       // result: (PRELD addr mem [0])
+       for {
+               addr := v_0
+               mem := v_1
+               v.reset(OpLOONG64PRELD)
+               v.AuxInt = int64ToAuxInt(0)
+               v.AddArg2(addr, mem)
+               return true
+       }
+}
+func rewriteValueLOONG64_OpPrefetchCacheStreamed(v *Value) bool {
+       v_1 := v.Args[1]
+       v_0 := v.Args[0]
+       // match: (PrefetchCacheStreamed addr mem)
+       // result: (PRELDX addr mem [(((512 << 1) + (1 << 12)) << 5) + 2])
+       for {
+               addr := v_0
+               mem := v_1
+               v.reset(OpLOONG64PRELDX)
+               v.AuxInt = int64ToAuxInt((((512 << 1) + (1 << 12)) << 5) + 2)
+               v.AddArg2(addr, mem)
+               return true
+       }
+}
 func rewriteValueLOONG64_OpRotateLeft16(v *Value) bool {
        v_1 := v.Args[1]
        v_0 := v.Args[0]
index 78350723da2bf76370ed91739cdc4b6713689048..97798f5bcc4b51500459351c319fd04a448a3889 100644 (file)
@@ -234,9 +234,9 @@ func initIntrinsics(cfg *intrinsicBuildConfig) {
        // Make Prefetch intrinsics for supported platforms
        // On the unsupported platforms stub function will be eliminated
        addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
-               sys.AMD64, sys.ARM64, sys.PPC64)
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
        addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
-               sys.AMD64, sys.ARM64, sys.PPC64)
+               sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
 
        /******** internal/runtime/atomic ********/
        type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
index 5d3b0519b7609b40b10b52807b6945c7439f4b20..6757e1e802928e8c97a8bcd7cc845a3de9506e17 100644 (file)
@@ -418,6 +418,8 @@ var wantIntrinsics = map[testIntrinsicKey]struct{}{
        {"loong64", "internal/runtime/sys", "Len64"}:                       struct{}{},
        {"loong64", "internal/runtime/sys", "Len8"}:                        struct{}{},
        {"loong64", "internal/runtime/sys", "OnesCount64"}:                 struct{}{},
+       {"loong64", "internal/runtime/sys", "Prefetch"}:                    struct{}{},
+       {"loong64", "internal/runtime/sys", "PrefetchStreamed"}:            struct{}{},
        {"loong64", "internal/runtime/sys", "TrailingZeros32"}:             struct{}{},
        {"loong64", "internal/runtime/sys", "TrailingZeros64"}:             struct{}{},
        {"loong64", "internal/runtime/sys", "TrailingZeros8"}:              struct{}{},