]> Cypherpunks repositories - gostls13.git/commitdiff
bytes: improve performance for bytes.Compare on ppc64x
authorLynn Boger <laboger@linux.vnet.ibm.com>
Thu, 13 Oct 2016 17:59:07 +0000 (12:59 -0500)
committerMichael Munday <munday@ca.ibm.com>
Mon, 17 Oct 2016 18:46:20 +0000 (18:46 +0000)
This improves the performance for byte.Compare by rewriting
the cmpbody function in runtime/asm_ppc64x.s.  The previous code
had a simple loop which loaded a pair of bytes and compared them,
which is inefficient for long buffers.  The updated function checks
for 8 or 32 byte chunks and then loads and compares double words where
possible.

Because the byte.Compare result indicates greater or less than,
the doubleword loads must take endianness into account, using a
byte reversed load in the little endian case.

Fixes #17433

benchmark                                   old ns/op     new ns/op     delta
BenchmarkBytesCompare/8-16                  13.6          7.16          -47.35%
BenchmarkBytesCompare/16-16                 25.7          7.83          -69.53%
BenchmarkBytesCompare/32-16                 38.1          7.78          -79.58%
BenchmarkBytesCompare/64-16                 63.0          10.6          -83.17%
BenchmarkBytesCompare/128-16                112           13.0          -88.39%
BenchmarkBytesCompare/256-16                211           28.1          -86.68%
BenchmarkBytesCompare/512-16                410           38.6          -90.59%
BenchmarkBytesCompare/1024-16               807           60.2          -92.54%
BenchmarkBytesCompare/2048-16               1601          103           -93.57%

Change-Id: I121acc74fcd27c430797647b8d682eb0607c63eb
Reviewed-on: https://go-review.googlesource.com/30949
Reviewed-by: David Chase <drchase@google.com>
src/cmd/internal/obj/ppc64/a.out.go
src/cmd/internal/obj/ppc64/anames.go
src/cmd/internal/obj/ppc64/asm9.go
src/runtime/asm_ppc64x.s

index 556ea3760875c04112b25364faa3083235097652..ac1126e8f20e38d00ac7ca81fa5252b02f08f724 100644 (file)
@@ -424,6 +424,7 @@ const (
        ALSW
        ALWAR
        ALWSYNC
+       AMOVDBR
        AMOVWBR
        AMOVB
        AMOVBU
index 1d766a2d6bd0a1341be0407b8539afe5845f9047..9e26666dbe1e53f7ae74cc9cab41dc8d671bebb8 100644 (file)
@@ -125,6 +125,7 @@ var Anames = []string{
        "LSW",
        "LWAR",
        "LWSYNC",
+       "MOVDBR",
        "MOVWBR",
        "MOVB",
        "MOVBU",
index d36e32ff0d69745a0f6af71e5874d4a5ed205e6c..7c9d83a7d1b587e6df7b895c52f8c5f58f9acffd 100644 (file)
@@ -1513,6 +1513,7 @@ func buildop(ctxt *obj.Link) {
 
                case AMOVHBR:
                        opset(AMOVWBR, r0)
+                       opset(AMOVDBR, r0)
 
                case ASLBMFEE:
                        opset(ASLBMFEV, r0)
@@ -3923,6 +3924,8 @@ func oploadx(ctxt *obj.Link, a obj.As) uint32 {
                return OPVCC(31, 790, 0, 0) /* lhbrx */
        case AMOVWBR:
                return OPVCC(31, 534, 0, 0) /* lwbrx */
+       case AMOVDBR:
+               return OPVCC(31, 532, 0, 0) /* ldbrx */
        case AMOVHZ:
                return OPVCC(31, 279, 0, 0) /* lhzx */
        case AMOVHZU:
index b5cd12bb3c6e348fe02a59f235102e76aab6824f..8b5ea4508278f5638341937874332c97999499c5 100644 (file)
@@ -812,13 +812,220 @@ eq:
        MOVB    R3, ret+16(FP)
        RET
 
-// Do an efficieint memequal for ppc64
-// for reuse where possible.
+// Do an efficient memcmp for ppc64le
+// R3 = s1 len
+// R4 = s2 len
+// R5 = s1 addr
+// R6 = s2 addr
+// R7 = addr of return value
+TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+       MOVD    R3,R8           // set up length
+       CMP     R3,R4,CR2       // unequal?
+       BC      12,8,setuplen   // BLT CR2
+       MOVD    R4,R8           // use R4 for comparison len
+setuplen:
+       MOVD    R8,CTR          // set up loop counter
+       CMP     R8,$8           // only optimize >=8
+       BLT     simplecheck
+       DCBT    (R5)            // cache hint
+       DCBT    (R6)
+       CMP     R8,$32          // optimize >= 32
+       MOVD    R8,R9
+       BLT     setup8a         // 8 byte moves only
+setup32a:
+       SRADCC  $5,R8,R9        // number of 32 byte chunks
+       MOVD    R9,CTR
+
+        // Special processing for 32 bytes or longer.
+        // Loading this way is faster and correct as long as the
+       // doublewords being compared are equal. Once they
+       // are found unequal, reload them in proper byte order
+       // to determine greater or less than.
+loop32a:
+       MOVD    0(R5),R9        // doublewords to compare
+       MOVD    0(R6),R10       // get 4 doublewords
+       MOVD    8(R5),R14
+       MOVD    8(R6),R15
+       CMPU    R9,R10          // bytes equal?
+       MOVD    $0,R16          // set up for cmpne
+       BNE     cmpne           // further compare for LT or GT
+       MOVD    16(R5),R9       // get next pair of doublewords
+       MOVD    16(R6),R10
+       CMPU    R4,R15          // bytes match?
+       MOVD    $8,R16          // set up for cmpne
+       BNE     cmpne           // further compare for LT or GT
+       MOVD    24(R5),R14      // get next pair of doublewords
+       MOVD    24(R6),R15
+       CMPU    R9,R10          // bytes match?
+       MOVD    $16,R16         // set up for cmpne
+       BNE     cmpne           // further compare for LT or GT
+       MOVD    $-8,R16         // for cmpne, R5,R6 already inc by 32
+       ADD     $32,R5          // bump up to next 32
+       ADD     $32,R6
+       CMPU    R14,R15         // bytes match?
+       BC      8,2,loop32a     // br ctr and cr
+       BNE     cmpne
+       ANDCC   $24,R8,R9       // Any 8 byte chunks?
+       BEQ     leftover        // and result is 0
+setup8a:
+       SRADCC  $3,R9,R9        // get the 8 byte count
+       BEQ     leftover        // shifted value is 0
+       MOVD    R9,CTR          // loop count for doublewords
+loop8:
+       MOVDBR  (R5+R0),R9      // doublewords to compare
+       MOVDBR  (R6+R0),R10     // LE compare order
+       ADD     $8,R5
+       ADD     $8,R6
+       CMPU    R9,R10          // match?
+       BC      8,2,loop8       // bt ctr <> 0 && cr
+       BGT     greater
+       BLT     less
+leftover:
+       ANDCC   $7,R8,R9        // check for leftover bytes
+       MOVD    R9,CTR          // save the ctr
+       BNE     simple          // leftover bytes
+       BC      12,10,equal     // test CR2 for length comparison
+       BC      12,8,less
+       BR      greater
+simplecheck:
+       CMP     R8,$0           // remaining compare length 0
+       BNE     simple          // do simple compare
+       BC      12,10,equal     // test CR2 for length comparison
+       BC      12,8,less       // 1st len < 2nd len, result less
+       BR      greater         // 1st len > 2nd len must be greater
+simple:
+       MOVBZ   0(R5), R9       // get byte from 1st operand
+       ADD     $1,R5
+       MOVBZ   0(R6), R10      // get byte from 2nd operand
+       ADD     $1,R6
+       CMPU    R9, R10
+       BC      8,2,simple      // bc ctr <> 0 && cr
+       BGT     greater         // 1st > 2nd
+       BLT     less            // 1st < 2nd
+       BC      12,10,equal     // test CR2 for length comparison
+       BC      12,9,greater    // 2nd len > 1st len
+       BR      less            // must be less
+cmpne:                         // only here is not equal
+       MOVDBR  (R5+R16),R8     // reload in reverse order
+       MOVDBR  (R6+R16),R9
+       CMPU    R8,R9           // compare correct endianness
+       BGT     greater         // here only if NE
+less:
+       MOVD    $-1,R3
+       MOVD    R3,(R7)         // return value if A < B
+       RET
+equal:
+       MOVD    $0,(R7)         // return value if A == B
+       RET
+greater:
+       MOVD    $1,R3
+       MOVD    R3,(R7)         // return value if A > B
+       RET
+
+// Do an efficient memcmp for ppc64 (BE)
+// R3 = s1 len
+// R4 = s2 len
+// R5 = s1 addr
+// R6 = s2 addr
+// R7 = addr of return value
+TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+       MOVD    R3,R8           // set up length
+       CMP     R3,R4,CR2       // unequal?
+       BC      12,8,setuplen   // BLT CR2
+       MOVD    R4,R8           // use R4 for comparison len
+setuplen:
+       MOVD    R8,CTR          // set up loop counter
+       CMP     R8,$8           // only optimize >=8
+       BLT     simplecheck
+       DCBT    (R5)            // cache hint
+       DCBT    (R6)
+       CMP     R8,$32          // optimize >= 32
+       MOVD    R8,R9
+       BLT     setup8a         // 8 byte moves only
+
+setup32a:
+       SRADCC  $5,R8,R9        // number of 32 byte chunks
+       MOVD    R9,CTR
+loop32a:
+       MOVD    0(R5),R9        // doublewords to compare
+       MOVD    0(R6),R10       // get 4 doublewords
+       MOVD    8(R5),R14
+       MOVD    8(R6),R15
+       CMPU    R9,R10          // bytes equal?
+       BLT     less            // found to be less
+       BGT     greater         // found to be greater
+       MOVD    16(R5),R9       // get next pair of doublewords
+       MOVD    16(R6),R10
+       CMPU    R14,R15         // bytes match?
+       BLT     less            // found less
+       BGT     greater         // found greater
+       MOVD    24(R5),R14      // get next pair of doublewords
+       MOVD    24(R6),R15
+       CMPU    R9,R10          // bytes match?
+       BLT     less            // found to be less
+       BGT     greater         // found to be greater
+       ADD     $32,R5          // bump up to next 32
+       ADD     $32,R6
+       CMPU    R14,R15         // bytes match?
+       BC      8,2,loop32a     // br ctr and cr
+       BLT     less            // with BE, byte ordering is
+       BGT     greater         // good for compare
+       ANDCC   $24,R8,R9       // Any 8 byte chunks?
+       BEQ     leftover        // and result is 0
+setup8a:
+       SRADCC  $3,R9,R9        // get the 8 byte count
+       BEQ     leftover        // shifted value is 0
+       MOVD    R9,CTR          // loop count for doublewords
+loop8:
+       MOVD    (R5),R9
+       MOVD    (R6),R10
+       ADD     $8,R5
+       ADD     $8,R6
+       CMPU    R9,R10          // match?
+       BC      8,2,loop8       // bt ctr <> 0 && cr
+       BGT     greater
+       BLT     less
+leftover:
+       ANDCC   $7,R8,R9        // check for leftover bytes
+       MOVD    R9,CTR          // save the ctr
+       BNE     simple          // leftover bytes
+       BC      12,10,equal     // test CR2 for length comparison
+       BC      12,8,less
+       BR      greater
+simplecheck:
+       CMP     R8,$0           // remaining compare length 0
+       BNE     simple          // do simple compare
+       BC      12,10,equal     // test CR2 for length comparison
+       BC      12,8,less       // 1st len < 2nd len, result less
+       BR      greater         // same len, must be equal
+simple:
+       MOVBZ   0(R5),R9        // get byte from 1st operand
+       ADD     $1,R5
+       MOVBZ   0(R6),R10       // get byte from 2nd operand
+       ADD     $1,R6
+       CMPU    R9,R10
+       BC      8,2,simple      // bc ctr <> 0 && cr
+       BGT     greater         // 1st > 2nd
+       BLT     less            // 1st < 2nd
+       BC      12,10,equal     // test CR2 for length comparison
+       BC      12,9,greater    // 2nd len > 1st len
+less:
+       MOVD    $-1,R3
+       MOVD    R3,(R7)         // return value if A < B
+       RET
+equal:
+       MOVD    $0,(R7)         // return value if A == B
+       RET
+greater:
+       MOVD    $1,R3
+       MOVD    R3,(R7)         // return value if A > B
+       RET
+
+// Do an efficient memequal for ppc64
 // R3 = s1
 // R4 = s2
 // R5 = len
 // R9 = return value
-// R6, R7 clobbered
 TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
        MOVD    R5,CTR
        CMP     R5,$8           // only optimize >=8
@@ -983,7 +1190,11 @@ TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
        MOVD    s2_base+16(FP), R6
        MOVD    s2_len+24(FP), R4
        MOVD    $ret+32(FP), R7
-       BR      runtime·cmpbody<>(SB)
+#ifdef GOARCH_ppc64le
+       BR      cmpbodyLE<>(SB)
+#else
+       BR      cmpbodyBE<>(SB)
+#endif
 
 TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
        MOVD    s1+0(FP), R5
@@ -991,50 +1202,11 @@ TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
        MOVD    s2+24(FP), R6
        MOVD    s2+32(FP), R4
        MOVD    $ret+48(FP), R7
-       BR      runtime·cmpbody<>(SB)
-
-// On entry:
-// R3 is the length of s1
-// R4 is the length of s2
-// R5 points to the start of s1
-// R6 points to the start of s2
-// R7 points to return value (-1/0/1 will be written here)
-//
-// On exit:
-// R5, R6, R8, R9 and R10 are clobbered
-TEXT runtime·cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
-       CMP     R5, R6
-       BEQ     samebytes // same starting pointers; compare lengths
-       SUB     $1, R5
-       SUB     $1, R6
-       MOVD    R4, R8
-       CMP     R3, R4
-       BGE     2(PC)
-       MOVD    R3, R8  // R8 is min(R3, R4)
-       ADD     R5, R8  // R5 is current byte in s1, R8 is last byte in s1 to compare
-loop:
-       CMP     R5, R8
-       BEQ     samebytes // all compared bytes were the same; compare lengths
-       MOVBZU  1(R5), R9
-       MOVBZU  1(R6), R10
-       CMP     R9, R10
-       BEQ     loop
-       // bytes differed
-       MOVD    $1, R4
-       BGT     2(PC)
-       NEG     R4
-       MOVD    R4, (R7)
-       RET
-samebytes:
-       MOVD    $1, R8
-       CMP     R3, R4
-       BNE     3(PC)
-       MOVD    R0, (R7)
-       RET
-       BGT     2(PC)
-       NEG     R8
-       MOVD    R8, (R7)
-       RET
+#ifdef GOARCH_ppc64le
+       BR      cmpbodyLE<>(SB)
+#else
+       BR      cmpbodyBE<>(SB)
+#endif
 
 TEXT runtime·fastrand(SB), NOSPLIT, $0-4
        MOVD    g_m(g), R4