The following performance improvements have been made to the
low-level atomic functions for ppc64le & ppc64:
- For those cases built around a lwarx/stwcx. pair (and the other
sizes), the sequence used to be:
sync, lwarx, optional update, stwcx., branch back to the sync, sync, isync
The leading sync is moved before (outside) the lwarx/stwcx. loop and
the trailing sync is removed, so it becomes:
sync, lwarx, optional update, stwcx., branch back to the lwarx, isync
(See the before/after sketch following this list.)
- For Or8 and And8, the manipulation of the address down to its
aligned word and the shifting of the value were removed; the
routines now use lbarx/stbcx. on the byte directly instead of
register shifting and xor followed by lwarx/stwcx. (The new Or8
body is shown after this list.)
- New instructions LWSYNC, LBAR, and STBCCC were tested and added.
runtime/atomic_ppc64x.s was changed to use the LWSYNC opcode
instead of the WORD encoding (one-line change shown after this list).
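To make the first change concrete, the Xadd loop (taken from the
diff below; comments added here for illustration) goes from:

    SYNC                  // re-executed on every failed store
    LWAR   (R4), R3       // load word and reserve
    ADD    R5, R3
    STWCCC R3, (R4)       // store conditional
    BNE    -4(PC)         // on failure, loop back to the SYNC
    SYNC                  // trailing full barrier
    ISYNC

to:

    SYNC                  // executed once, before the loop
    LWAR   (R4), R3
    ADD    R5, R3
    STWCCC R3, (R4)
    BNE    -3(PC)         // on failure, loop back to the LWAR
    ISYNC                 // trailing SYNC removed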
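Likewise, the whole of the new Or8 (assembled from the diff below)
reduces to a byte-sized reserve/store-conditional loop:

    MOVD   ptr+0(FP), R3
    MOVBZ  val+8(FP), R4
    SYNC
again:
    LBAR   (R3), R6       // lbarx: load byte and reserve
    OR     R4, R6
    STBCCC R6, (R3)       // stbcx.: store byte conditional
    BNE    again
    ISYNC
    RET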
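And the LWSYNC cleanup is a one-line substitution:

    WORD $0x7c2004ac      // old: lwsync by raw encoding
    LWSYNC                // new: native mnemonic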
Fixes #15469
Some of the benchmarks in the runtime and sync directories were run.
Results varied from run to run, but the overall trend was improvement,
comparing the best times (ns/op) for base and new:
runtime.test:
BenchmarkChanNonblocking-128 0.88 0.89 +1.14%
BenchmarkChanUncontended-128 569 511 -10.19%
BenchmarkChanContended-128 63110 53231 -15.65%
BenchmarkChanSync-128 691 598 -13.46%
BenchmarkChanSyncWork-128 11355 11649 +2.59%
BenchmarkChanProdCons0-128 2402 2090 -12.99%
BenchmarkChanProdCons10-128 1348 1363 +1.11%
BenchmarkChanProdCons100-128 1002 746 -25.55%
BenchmarkChanProdConsWork0-128 2554 2720 +6.50%
BenchmarkChanProdConsWork10-128 1909 1804 -5.50%
BenchmarkChanProdConsWork100-128 1624 1580 -2.71%
BenchmarkChanCreation-128 237 212 -10.55%
BenchmarkChanSem-128 705 667 -5.39%
BenchmarkChanPopular-128 5081190 4497566 -11.49%
BenchmarkCreateGoroutines-128 532 473 -11.09%
BenchmarkCreateGoroutinesParallel-128 35.0 34.7 -0.86%
BenchmarkCreateGoroutinesCapture-128 4923 4200 -14.69%
sync.test:
BenchmarkUncontendedSemaphore-128 112 94.2 -15.89%
BenchmarkContendedSemaphore-128 133 128 -3.76%
BenchmarkMutexUncontended-128 1.90 1.67 -12.11%
BenchmarkMutex-128 353 310 -12.18%
BenchmarkMutexSlack-128 304 283 -6.91%
BenchmarkMutexWork-128 554 541 -2.35%
BenchmarkMutexWorkSlack-128 567 556 -1.94%
BenchmarkMutexNoSpin-128 275 242 -12.00%
BenchmarkMutexSpin-128 1129 1030 -8.77%
BenchmarkOnce-128 1.08 0.96 -11.11%
BenchmarkPool-128 29.8 27.4 -8.05%
BenchmarkPoolOverflow-128 40564 36583 -9.81%
BenchmarkSemaUncontended-128 3.14 2.63 -16.24%
BenchmarkSemaSyntNonblock-128 1087 1069 -1.66%
BenchmarkSemaSyntBlock-128 897 893 -0.45%
BenchmarkSemaWorkNonblock-128 1034 1028 -0.58%
BenchmarkSemaWorkBlock-128 949 886 -6.64%
Change-Id: I4403fb29d3cd5254b7b1ce87a216bd11b391079e
Reviewed-on: https://go-review.googlesource.com/22549
Reviewed-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Minux Ma <minux@golang.org>
AFSUBS
AFSUBSCC
AMOVMW
+ ALBAR
ALSW
ALWAR
+ ALWSYNC
AMOVWBR
AMOVB
AMOVBU
ASRAW
ASRAWCC
ASRWCC
+ ASTBCCC
ASTSW
ASTWCCC
ASUB
"FSUBS",
"FSUBSCC",
"MOVMW",
+ "LBAR",
"LSW",
"LWAR",
+ "LWSYNC",
"MOVWBR",
"MOVB",
"MOVBU",
"SRAW",
"SRAWCC",
"SRWCC",
+ "STBCCC",
"STSW",
"STWCCC",
"SUB",
case AECOWX: /* indexed store: op s,(b+a); op s,(b) */
opset(ASTWCCC, r0)
+ opset(ASTBCCC, r0)
opset(ASTDCCC, r0)
case ASYNC:
opset(AISYNC, r0)
+ opset(ALWSYNC, r0)
opset(APTESYNC, r0)
opset(ATLBSYNC, r0)
opset(AFMOVSU, r0)
case AECIWX:
+ opset(ALBAR, r0)
opset(ALWAR, r0)
opset(ALDAR, r0)
case ASYNC:
return OPVCC(31, 598, 0, 0)
+ case ALWSYNC:
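+ // lwsync is the sync opcode (31/598) with the L field set to 1;
+ // ptesync below uses L=2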
+ return OPVCC(31, 598, 0, 0) | 1<<21
+
case APTESYNC:
return OPVCC(31, 598, 0, 0) | 2<<21
return OPVCC(31, 311, 0, 0) /* lhzux */
case AECIWX:
return OPVCC(31, 310, 0, 0) /* eciwx */
+ case ALBAR:
+ return OPVCC(31, 52, 0, 0) /* lbarx */
case ALWAR:
return OPVCC(31, 20, 0, 0) /* lwarx */
case ALDAR:
return OPVCC(31, 84, 0, 0) /* ldarx */
case ASTSW:
return OPVCC(31, 661, 0, 0) /* stswx */
case AMOVWBR:
return OPVCC(31, 662, 0, 0) /* stwbrx */
+ case ASTBCCC:
+ return OPVCC(31, 694, 0, 1) /* stbcx. */
case ASTWCCC:
return OPVCC(31, 150, 0, 1) /* stwcx. */
case ASTDCCC:
return OPVCC(31, 214, 0, 1) /* stdcx. */
}
case ALWAR,
+ ALBAR,
+ ASTBCCC,
ASTWCCC,
AECIWX,
AECOWX,
ASYNC,
ATLBSYNC,
APTESYNC,
+ ALWSYNC,
ATW,
AWORD,
ARFI,
// LWSYNC is the "export" barrier recommended by Power ISA
// v2.07 book II, appendix B.2.2.2.
// LWSYNC is a load/load, load/store, and store/store barrier.
- WORD $0x7c2004ac // LWSYNC
+ LWSYNC
RET
MOVD ptr+0(FP), R3
MOVWZ old+8(FP), R4
MOVWZ new+12(FP), R5
-cas_again:
SYNC
+cas_again:
LWAR (R3), R6
CMPW R6, R4
BNE cas_fail
STWCCC R5, (R3)
BNE cas_again
MOVD $1, R3
- SYNC
ISYNC
MOVB R3, ret+16(FP)
RET
cas_fail:
- MOVD $0, R3
- BR -5(PC)
+ MOVB R0, ret+16(FP) // R0 holds 0 by convention, so this stores false
+ RET
// bool runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
// Atomically:
MOVD ptr+0(FP), R3
MOVD old+8(FP), R4
MOVD new+16(FP), R5
-cas64_again:
SYNC
+cas64_again:
LDAR (R3), R6
CMP R6, R4
BNE cas64_fail
STDCCC R5, (R3)
BNE cas64_again
MOVD $1, R3
- SYNC
ISYNC
MOVB R3, ret+24(FP)
RET
cas64_fail:
- MOVD $0, R3
- BR -5(PC)
+ MOVB R0, ret+24(FP) // R0 holds 0 by convention, so this stores false
+ RET
TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
BR runtime∕internal∕atomic·Cas64(SB)
LWAR (R4), R3
ADD R5, R3
STWCCC R3, (R4)
- BNE -4(PC)
- SYNC
+ BNE -3(PC)
ISYNC
MOVW R3, ret+16(FP)
RET
LDAR (R4), R3
ADD R5, R3
STDCCC R3, (R4)
- BNE -4(PC)
- SYNC
+ BNE -3(PC)
ISYNC
MOVD R3, ret+16(FP)
RET
SYNC
LWAR (R4), R3
STWCCC R5, (R4)
- BNE -3(PC)
- SYNC
+ BNE -2(PC)
ISYNC
MOVW R3, ret+16(FP)
RET
SYNC
LDAR (R4), R3
STDCCC R5, (R4)
- BNE -3(PC)
- SYNC
+ BNE -2(PC)
ISYNC
MOVD R3, ret+16(FP)
RET
TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
- // Align ptr down to 4 bytes so we can use 32-bit load/store.
- // R5 = (R3 << 0) & ~3
- RLDCR $0, R3, $~3, R5
- // Compute val shift.
-#ifdef GOARCH_ppc64
- // Big endian. ptr = ptr ^ 3
- XOR $3, R3
-#endif
- // R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
- RLDC $3, R3, $(3*8), R6
- // Shift val for aligned ptr. R4 = val << R6
- SLD R6, R4, R4
-
-again:
SYNC
- LWAR (R5), R6
+again:
+ LBAR (R3), R6
OR R4, R6
- STWCCC R6, (R5)
+ STBCCC R6, (R3)
BNE again
- SYNC
ISYNC
RET
TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
- // Align ptr down to 4 bytes so we can use 32-bit load/store.
- // R5 = (R3 << 0) & ~3
- RLDCR $0, R3, $~3, R5
- // Compute val shift.
-#ifdef GOARCH_ppc64
- // Big endian. ptr = ptr ^ 3
- XOR $3, R3
-#endif
- // R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
- RLDC $3, R3, $(3*8), R6
- // Shift val for aligned ptr. R4 = val << R6 | ^(0xFF << R6)
- MOVD $0xFF, R7
- SLD R6, R4
- SLD R6, R7
- XOR $-1, R7
- OR R7, R4
-again:
SYNC
- LWAR (R5), R6
+again:
+ LBAR (R3), R6
AND R4, R6
- STWCCC R6, (R5)
+ STBCCC R6, (R3)
BNE again
- SYNC
ISYNC
RET
SYNC
LWAR (R3), R5
STWCCC R4, (R3)
- BNE -3(PC)
- SYNC
+ BNE -2(PC)
ISYNC
MOVW R5, old+16(FP)
RET
SYNC
LDAR (R3), R5
STDCCC R4, (R3)
- BNE -3(PC)
- SYNC
+ BNE -2(PC)
ISYNC
MOVD R5, old+16(FP)
RET
SYNC
LWAR (R3), R6
CMPW R6, R4
- BNE 8(PC)
+ BNE 7(PC)
STWCCC R5, (R3)
- BNE -5(PC)
- SYNC
+ BNE -4(PC)
ISYNC
MOVD $1, R3
MOVB R3, swapped+16(FP)
SYNC
LDAR (R3), R6
CMP R6, R4
- BNE 8(PC)
+ BNE 7(PC)
STDCCC R5, (R3)
- BNE -5(PC)
- SYNC
+ BNE -4(PC)
ISYNC
MOVD $1, R3
MOVB R3, swapped+24(FP)
LWAR (R3), R5
ADD R4, R5
STWCCC R5, (R3)
- BNE -4(PC)
- SYNC
+ BNE -3(PC)
ISYNC
MOVW R5, ret+16(FP)
RET
LDAR (R3), R5
ADD R4, R5
STDCCC R5, (R3)
- BNE -4(PC)
- SYNC
+ BNE -3(PC)
ISYNC
MOVD R5, ret+16(FP)
RET