This CL enables intrinsic support to emit the following prefetch
instructions for loong64 platform:
1.Prefetch - prefetches data from memory address to cache;
2.PrefetchStreamed - prefetches data from memory address, with a
hint that this data is being streamed.
Benchmarks picked from go/test/bench/garbage
Parameters tested with:
GOMAXPROCS=8
tree2 -heapsize=
1000000000 -cpus=8
tree -n=18
parser
peano
Benchmarks Loongson-3A6000-HV @ 2500.00MHz:
| bench.old | bench.new |
| sec/op | sec/op vs base |
Tree2-8 1238.2µ ± 24% 999.9µ ± 453% ~ (p=0.089 n=10)
Tree-8 277.4m ± 1% 275.5m ± 1% ~ (p=0.063 n=10)
Parser-8 3.564 ± 0% 3.509 ± 1% -1.56% (p=0.000 n=10)
Peano-8 39.12m ± 2% 38.85m ± 2% ~ (p=0.353 n=10)
geomean 83.19m 78.28m -5.90%
Change-Id: I59e9aa4f609a106d4f70706e6d6d1fe6738ab72a
Reviewed-on: https://go-review.googlesource.com/c/go/+/671876
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Meidan Li <limeidan@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
p.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
+
+ case ssa.OpLOONG64PRELD:
+ // PRELD (Rarg0), hint
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = v.Args[0].Reg()
+ p.AddRestSourceConst(v.AuxInt & 0x1f)
+
+ case ssa.OpLOONG64PRELDX:
+ // PRELDX (Rarg0), $n, $hint
+ p := s.Prog(v.Op.Asm())
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = v.Args[0].Reg()
+ p.AddRestSourceArgs([]obj.Addr{
+ {Type: obj.TYPE_CONST, Offset: int64((v.AuxInt >> 5) & 0x1fffffffff)},
+ {Type: obj.TYPE_CONST, Offset: int64((v.AuxInt >> 0) & 0x1f)},
+ })
+
case ssa.OpClobber, ssa.OpClobberReg:
// TODO: implement for clobberdead experiment. Nop is ok for now.
default:
(MOVBUreg (ANDconst [c] x)) => (ANDconst [c&0xff] x)
+// Prefetch instructions (hint specified using aux field)
+// For PRELD{,X} A value of hint indicates:
+// hint=0 is defined as load prefetch to L1-cache
+// hint=2 is defined as load prefetch to L3-cache
+// The PrefetchCacheStreamed implementation prefetches 512 bytes of data
+// into L3. The aux field are defined as follows:
+// bit[4:0]:
+// $hint parameter of PRELDX instruction
+// bit[41:5]:
+// $n parameter of PRELDX instruction, bit[0] of $n is the address
+// sequence, bits[11:1] is the block size, bits[20:12] is the block
+// num, bits[36:21] is the stride, for more details about $n, refer
+// to src/cmd/internal/obj/loong64/doc.go
+(PrefetchCache addr mem) => (PRELD addr mem [0])
+(PrefetchCacheStreamed addr mem) => (PRELDX addr mem [(((512 << 1) + (1 << 12)) << 5) + 2])
+
// constant comparisons
(SGTconst [c] (MOVVconst [d])) && c>d => (MOVVconst [1])
(SGTconst [c] (MOVVconst [d])) && c<=d => (MOVVconst [0])
gpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
+ preldreg = regInfo{inputs: []regMask{gpspg}}
fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
fp11 = regInfo{inputs: []regMask{fp}, outputs: []regMask{fp}}
fp21 = regInfo{inputs: []regMask{fp, fp}, outputs: []regMask{fp}}
{name: "LoweredPanicBoundsA", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r3, r4}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r2, r3}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
{name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in genericOps.go).
+
+ // Prefetch instruction
+ // Do prefetch arg0 address with option aux. arg0=addr, arg1=memory, aux=option.
+ // Note:
+ // The aux of PRELDX is actually composed of two values: $hint and $n. bit[4:0]
+ // is $hint and bit[41:5] is $n.
+ {name: "PRELD", argLength: 2, aux: "Int64", reg: preldreg, asm: "PRELD", hasSideEffects: true},
+ {name: "PRELDX", argLength: 2, aux: "Int64", reg: preldreg, asm: "PRELDX", hasSideEffects: true},
}
blocks := []blockData{
OpLOONG64LoweredPanicBoundsA
OpLOONG64LoweredPanicBoundsB
OpLOONG64LoweredPanicBoundsC
+ OpLOONG64PRELD
+ OpLOONG64PRELDX
OpMIPSADD
OpMIPSADDconst
},
},
},
+ {
+ name: "PRELD",
+ auxType: auxInt64,
+ argLen: 2,
+ hasSideEffects: true,
+ asm: loong64.APRELD,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073741820}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+ },
+ },
+ },
+ {
+ name: "PRELDX",
+ auxType: auxInt64,
+ argLen: 2,
+ hasSideEffects: true,
+ asm: loong64.APRELDX,
+ reg: regInfo{
+ inputs: []inputInfo{
+ {0, 1073741820}, // SP R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R18 R19 R20 R21 g R23 R24 R25 R26 R27 R28 R29 R31
+ },
+ },
+ },
{
name: "ADD",
return rewriteValueLOONG64_OpPopCount32(v)
case OpPopCount64:
return rewriteValueLOONG64_OpPopCount64(v)
+ case OpPrefetchCache:
+ return rewriteValueLOONG64_OpPrefetchCache(v)
+ case OpPrefetchCacheStreamed:
+ return rewriteValueLOONG64_OpPrefetchCacheStreamed(v)
case OpPubBarrier:
v.Op = OpLOONG64LoweredPubBarrier
return true
return true
}
}
+func rewriteValueLOONG64_OpPrefetchCache(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (PrefetchCache addr mem)
+ // result: (PRELD addr mem [0])
+ for {
+ addr := v_0
+ mem := v_1
+ v.reset(OpLOONG64PRELD)
+ v.AuxInt = int64ToAuxInt(0)
+ v.AddArg2(addr, mem)
+ return true
+ }
+}
+func rewriteValueLOONG64_OpPrefetchCacheStreamed(v *Value) bool {
+ v_1 := v.Args[1]
+ v_0 := v.Args[0]
+ // match: (PrefetchCacheStreamed addr mem)
+ // result: (PRELDX addr mem [(((512 << 1) + (1 << 12)) << 5) + 2])
+ for {
+ addr := v_0
+ mem := v_1
+ v.reset(OpLOONG64PRELDX)
+ v.AuxInt = int64ToAuxInt((((512 << 1) + (1 << 12)) << 5) + 2)
+ v.AddArg2(addr, mem)
+ return true
+ }
+}
func rewriteValueLOONG64_OpRotateLeft16(v *Value) bool {
v_1 := v.Args[1]
v_0 := v.Args[0]
// Make Prefetch intrinsics for supported platforms
// On the unsupported platforms stub function will be eliminated
addF("internal/runtime/sys", "Prefetch", makePrefetchFunc(ssa.OpPrefetchCache),
- sys.AMD64, sys.ARM64, sys.PPC64)
+ sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
addF("internal/runtime/sys", "PrefetchStreamed", makePrefetchFunc(ssa.OpPrefetchCacheStreamed),
- sys.AMD64, sys.ARM64, sys.PPC64)
+ sys.AMD64, sys.ARM64, sys.Loong64, sys.PPC64)
/******** internal/runtime/atomic ********/
type atomicOpEmitter func(s *state, n *ir.CallExpr, args []*ssa.Value, op ssa.Op, typ types.Kind, needReturn bool)
{"loong64", "internal/runtime/sys", "Len64"}: struct{}{},
{"loong64", "internal/runtime/sys", "Len8"}: struct{}{},
{"loong64", "internal/runtime/sys", "OnesCount64"}: struct{}{},
+ {"loong64", "internal/runtime/sys", "Prefetch"}: struct{}{},
+ {"loong64", "internal/runtime/sys", "PrefetchStreamed"}: struct{}{},
{"loong64", "internal/runtime/sys", "TrailingZeros32"}: struct{}{},
{"loong64", "internal/runtime/sys", "TrailingZeros64"}: struct{}{},
{"loong64", "internal/runtime/sys", "TrailingZeros8"}: struct{}{},