From 1e28dce80ad2ec195d55269266c5cca7ebd845a5 Mon Sep 17 00:00:00 2001
From: Lynn Boger <laboger@linux.vnet.ibm.com>
Date: Thu, 13 Oct 2016 12:59:07 -0500
Subject: [PATCH] bytes: improve performance for bytes.Compare on ppc64x

This improves the performance for byte.Compare by rewriting
the cmpbody function in runtime/asm_ppc64x.s.  The previous code
had a simple loop which loaded a pair of bytes and compared them,
which is inefficient for long buffers.  The updated function checks
for 8 or 32 byte chunks and then loads and compares double words where
possible.

Because the byte.Compare result indicates greater or less than,
the doubleword loads must take endianness into account, using a
byte reversed load in the little endian case.

Fixes #17433

benchmark                                   old ns/op     new ns/op     delta
BenchmarkBytesCompare/8-16                  13.6          7.16          -47.35%
BenchmarkBytesCompare/16-16                 25.7          7.83          -69.53%
BenchmarkBytesCompare/32-16                 38.1          7.78          -79.58%
BenchmarkBytesCompare/64-16                 63.0          10.6          -83.17%
BenchmarkBytesCompare/128-16                112           13.0          -88.39%
BenchmarkBytesCompare/256-16                211           28.1          -86.68%
BenchmarkBytesCompare/512-16                410           38.6          -90.59%
BenchmarkBytesCompare/1024-16               807           60.2          -92.54%
BenchmarkBytesCompare/2048-16               1601          103           -93.57%

Change-Id: I121acc74fcd27c430797647b8d682eb0607c63eb
Reviewed-on: https://go-review.googlesource.com/30949
Reviewed-by: David Chase <drchase@google.com>
---
 src/cmd/internal/obj/ppc64/a.out.go  |   1 +
 src/cmd/internal/obj/ppc64/anames.go |   1 +
 src/cmd/internal/obj/ppc64/asm9.go   |   3 +
 src/runtime/asm_ppc64x.s             | 268 ++++++++++++++++++++++-----
 4 files changed, 225 insertions(+), 48 deletions(-)

diff --git a/src/cmd/internal/obj/ppc64/a.out.go b/src/cmd/internal/obj/ppc64/a.out.go
index 556ea37608..ac1126e8f2 100644
--- a/src/cmd/internal/obj/ppc64/a.out.go
+++ b/src/cmd/internal/obj/ppc64/a.out.go
@@ -424,6 +424,7 @@ const (
 	ALSW
 	ALWAR
 	ALWSYNC
+	AMOVDBR
 	AMOVWBR
 	AMOVB
 	AMOVBU
diff --git a/src/cmd/internal/obj/ppc64/anames.go b/src/cmd/internal/obj/ppc64/anames.go
index 1d766a2d6b..9e26666dbe 100644
--- a/src/cmd/internal/obj/ppc64/anames.go
+++ b/src/cmd/internal/obj/ppc64/anames.go
@@ -125,6 +125,7 @@ var Anames = []string{
 	"LSW",
 	"LWAR",
 	"LWSYNC",
+	"MOVDBR",
 	"MOVWBR",
 	"MOVB",
 	"MOVBU",
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go
index d36e32ff0d..7c9d83a7d1 100644
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -1513,6 +1513,7 @@ func buildop(ctxt *obj.Link) {
 
 		case AMOVHBR:
 			opset(AMOVWBR, r0)
+			opset(AMOVDBR, r0)
 
 		case ASLBMFEE:
 			opset(ASLBMFEV, r0)
@@ -3923,6 +3924,8 @@ func oploadx(ctxt *obj.Link, a obj.As) uint32 {
 		return OPVCC(31, 790, 0, 0) /* lhbrx */
 	case AMOVWBR:
 		return OPVCC(31, 534, 0, 0) /* lwbrx */
+	case AMOVDBR:
+		return OPVCC(31, 532, 0, 0) /* ldbrx */
 	case AMOVHZ:
 		return OPVCC(31, 279, 0, 0) /* lhzx */
 	case AMOVHZU:
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index b5cd12bb3c..8b5ea45082 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -812,13 +812,220 @@ eq:
 	MOVB	R3, ret+16(FP)
 	RET
 
-// Do an efficieint memequal for ppc64
-// for reuse where possible.
+// Do an efficient memcmp for ppc64le
+// R3 = s1 len
+// R4 = s2 len
+// R5 = s1 addr
+// R6 = s2 addr
+// R7 = addr of return value
+TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BC	12,8,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	MOVD	R8,CTR		// set up loop counter
+	CMP	R8,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R5)		// cache hint
+	DCBT	(R6)
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// 8 byte moves only
+setup32a:
+	SRADCC	$5,R8,R9	// number of 32 byte chunks
+	MOVD	R9,CTR
+
+        // Special processing for 32 bytes or longer.
+        // Loading this way is faster and correct as long as the
+	// doublewords being compared are equal. Once they
+	// are found unequal, reload them in proper byte order
+	// to determine greater or less than.
+loop32a:
+	MOVD	0(R5),R9	// doublewords to compare
+	MOVD	0(R6),R10	// get 4 doublewords
+	MOVD	8(R5),R14
+	MOVD	8(R6),R15
+	CMPU	R9,R10		// bytes equal?
+	MOVD	$0,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	16(R5),R9	// get next pair of doublewords
+	MOVD	16(R6),R10
+	CMPU	R4,R15		// bytes match?
+	MOVD	$8,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	24(R5),R14	// get next pair of doublewords
+	MOVD    24(R6),R15
+	CMPU	R9,R10		// bytes match?
+	MOVD	$16,R16		// set up for cmpne
+	BNE	cmpne		// further compare for LT or GT
+	MOVD	$-8,R16		// for cmpne, R5,R6 already inc by 32
+	ADD	$32,R5		// bump up to next 32
+	ADD	$32,R6
+	CMPU    R14,R15		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BNE	cmpne
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R9,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+	MOVDBR	(R5+R0),R9	// doublewords to compare
+	MOVDBR	(R6+R0),R10	// LE compare order
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R9,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	MOVD	R9,CTR		// save the ctr
+	BNE	simple		// leftover bytes
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less
+	BR	greater
+simplecheck:
+	CMP	R8,$0		// remaining compare length 0
+	BNE	simple		// do simple compare
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less	// 1st len < 2nd len, result less
+	BR	greater		// 1st len > 2nd len must be greater
+simple:
+	MOVBZ	0(R5), R9	// get byte from 1st operand
+	ADD	$1,R5
+	MOVBZ	0(R6), R10	// get byte from 2nd operand
+	ADD	$1,R6
+	CMPU	R9, R10
+	BC	8,2,simple	// bc ctr <> 0 && cr
+	BGT	greater		// 1st > 2nd
+	BLT	less		// 1st < 2nd
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,9,greater	// 2nd len > 1st len
+	BR	less		// must be less
+cmpne:				// only here is not equal
+	MOVDBR	(R5+R16),R8	// reload in reverse order
+	MOVDBR	(R6+R16),R9
+	CMPU	R8,R9		// compare correct endianness
+	BGT	greater		// here only if NE
+less:
+	MOVD	$-1,R3
+	MOVD	R3,(R7)		// return value if A < B
+	RET
+equal:
+	MOVD	$0,(R7)		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3
+	MOVD	R3,(R7)		// return value if A > B
+	RET
+
+// Do an efficient memcmp for ppc64 (BE)
+// R3 = s1 len
+// R4 = s2 len
+// R5 = s1 addr
+// R6 = s2 addr
+// R7 = addr of return value
+TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R3,R8		// set up length
+	CMP	R3,R4,CR2	// unequal?
+	BC	12,8,setuplen	// BLT CR2
+	MOVD	R4,R8		// use R4 for comparison len
+setuplen:
+	MOVD	R8,CTR		// set up loop counter
+	CMP	R8,$8		// only optimize >=8
+	BLT	simplecheck
+	DCBT	(R5)		// cache hint
+	DCBT	(R6)
+	CMP	R8,$32		// optimize >= 32
+	MOVD	R8,R9
+	BLT	setup8a		// 8 byte moves only
+
+setup32a:
+	SRADCC	$5,R8,R9	// number of 32 byte chunks
+	MOVD	R9,CTR
+loop32a:
+	MOVD	0(R5),R9	// doublewords to compare
+	MOVD	0(R6),R10	// get 4 doublewords
+	MOVD	8(R5),R14
+	MOVD	8(R6),R15
+	CMPU	R9,R10		// bytes equal?
+	BLT	less		// found to be less
+	BGT	greater		// found to be greater
+	MOVD	16(R5),R9	// get next pair of doublewords
+	MOVD	16(R6),R10
+	CMPU	R14,R15		// bytes match?
+	BLT	less		// found less
+	BGT	greater		// found greater
+	MOVD	24(R5),R14	// get next pair of doublewords
+	MOVD	24(R6),R15
+	CMPU	R9,R10		// bytes match?
+	BLT	less		// found to be less
+	BGT	greater		// found to be greater
+	ADD	$32,R5		// bump up to next 32
+	ADD	$32,R6
+	CMPU	R14,R15		// bytes match?
+	BC	8,2,loop32a	// br ctr and cr
+	BLT	less		// with BE, byte ordering is
+	BGT	greater		// good for compare
+	ANDCC	$24,R8,R9	// Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC	$3,R9,R9	// get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD	R9,CTR		// loop count for doublewords
+loop8:
+	MOVD	(R5),R9
+	MOVD	(R6),R10
+	ADD	$8,R5
+	ADD	$8,R6
+	CMPU	R9,R10		// match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BGT	greater
+	BLT	less
+leftover:
+	ANDCC	$7,R8,R9	// check for leftover bytes
+	MOVD	R9,CTR		// save the ctr
+	BNE	simple		// leftover bytes
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,8,less
+	BR	greater
+simplecheck:
+	CMP	R8,$0		// remaining compare length 0
+	BNE	simple		// do simple compare
+	BC	12,10,equal	// test CR2 for length comparison
+	BC 	12,8,less	// 1st len < 2nd len, result less
+	BR	greater		// same len, must be equal
+simple:
+	MOVBZ	0(R5),R9	// get byte from 1st operand
+	ADD	$1,R5
+	MOVBZ	0(R6),R10	// get byte from 2nd operand
+	ADD	$1,R6
+	CMPU	R9,R10
+	BC	8,2,simple	// bc ctr <> 0 && cr
+	BGT	greater		// 1st > 2nd
+	BLT	less		// 1st < 2nd
+	BC	12,10,equal	// test CR2 for length comparison
+	BC	12,9,greater	// 2nd len > 1st len
+less:
+	MOVD	$-1,R3
+	MOVD    R3,(R7)		// return value if A < B
+	RET
+equal:
+	MOVD    $0,(R7)		// return value if A == B
+	RET
+greater:
+	MOVD	$1,R3
+	MOVD	R3,(R7)		// return value if A > B
+	RET
+
+// Do an efficient memequal for ppc64
 // R3 = s1
 // R4 = s2
 // R5 = len
 // R9 = return value
-// R6, R7 clobbered
 TEXT runtimeÂ·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD    R5,CTR
 	CMP     R5,$8		// only optimize >=8
@@ -983,7 +1190,11 @@ TEXT runtimeÂ·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
 	MOVD	s2_base+16(FP), R6
 	MOVD	s2_len+24(FP), R4
 	MOVD	$ret+32(FP), R7
-	BR	runtimeÂ·cmpbody<>(SB)
+#ifdef	GOARCH_ppc64le
+	BR	cmpbodyLE<>(SB)
+#else
+	BR      cmpbodyBE<>(SB)
+#endif
 
 TEXT bytesÂ·Compare(SB),NOSPLIT|NOFRAME,$0-56
 	MOVD	s1+0(FP), R5
@@ -991,50 +1202,11 @@ TEXT bytesÂ·Compare(SB),NOSPLIT|NOFRAME,$0-56
 	MOVD	s2+24(FP), R6
 	MOVD	s2+32(FP), R4
 	MOVD	$ret+48(FP), R7
-	BR	runtimeÂ·cmpbody<>(SB)
-
-// On entry:
-// R3 is the length of s1
-// R4 is the length of s2
-// R5 points to the start of s1
-// R6 points to the start of s2
-// R7 points to return value (-1/0/1 will be written here)
-//
-// On exit:
-// R5, R6, R8, R9 and R10 are clobbered
-TEXT runtimeÂ·cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
-	CMP	R5, R6
-	BEQ	samebytes // same starting pointers; compare lengths
-	SUB	$1, R5
-	SUB	$1, R6
-	MOVD	R4, R8
-	CMP	R3, R4
-	BGE	2(PC)
-	MOVD	R3, R8	// R8 is min(R3, R4)
-	ADD	R5, R8	// R5 is current byte in s1, R8 is last byte in s1 to compare
-loop:
-	CMP	R5, R8
-	BEQ	samebytes // all compared bytes were the same; compare lengths
-	MOVBZU	1(R5), R9
-	MOVBZU	1(R6), R10
-	CMP	R9, R10
-	BEQ	loop
-	// bytes differed
-	MOVD	$1, R4
-	BGT	2(PC)
-	NEG	R4
-	MOVD	R4, (R7)
-	RET
-samebytes:
-	MOVD	$1, R8
-	CMP	R3, R4
-	BNE	3(PC)
-	MOVD	R0, (R7)
-	RET
-	BGT	2(PC)
-	NEG	R8
-	MOVD	R8, (R7)
-	RET
+#ifdef	GOARCH_ppc64le
+	BR	cmpbodyLE<>(SB)
+#else
+	BR      cmpbodyBE<>(SB)
+#endif
 
 TEXT runtimeÂ·fastrand(SB), NOSPLIT, $0-4
 	MOVD	g_m(g), R4
-- 
2.50.0