runtime: improve IndexByte for ppc64x
author Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Tue, 28 Feb 2017 00:32:29 +0000 (21:32 -0300)
committer Lynn Boger <laboger@linux.vnet.ibm.com>
Thu, 16 Mar 2017 13:54:20 +0000 (13:54 +0000)
This change adds a faster implementation of IndexByte for ppc64x, scanning
the input a doubleword at a time with CMPB instead of one byte at a time.

Improvement for bytes·IndexByte:

benchmark                             old ns/op     new ns/op     delta
BenchmarkIndexByte/10-16              12.5          8.48          -32.16%
BenchmarkIndexByte/32-16              34.4          9.85          -71.37%
BenchmarkIndexByte/4K-16              3089          217           -92.98%
BenchmarkIndexByte/4M-16              3154810       207051        -93.44%
BenchmarkIndexByte/64M-16             50564811      5579093       -88.97%

benchmark                             old MB/s     new MB/s     speedup
BenchmarkIndexByte/10-16              800.41       1179.64      1.47x
BenchmarkIndexByte/32-16              930.60       3249.10      3.49x
BenchmarkIndexByte/4K-16              1325.71      18832.53     14.21x
BenchmarkIndexByte/4M-16              1329.49      20257.29     15.24x
BenchmarkIndexByte/64M-16             1327.19      12028.63     9.06x

Improvement for strings·IndexByte:

benchmark                             old ns/op     new ns/op     delta
BenchmarkIndexByte-16                 25.9          7.69          -70.31%
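
Numbers of this shape come from the standard benchmarks in the bytes and
strings packages. As a point of reference, a minimal self-contained benchmark
of the same form might look like the sketch below (the file name, package
name, and 4 KiB size are assumptions, not the actual test code); b.SetBytes
is what makes the testing package report the MB/s column shown above.

// indexbyte_bench_test.go (hypothetical file name)
package indexbyte_test

import (
	"bytes"
	"testing"
)

// BenchmarkIndexByte4K scans a 4 KiB buffer whose target byte is placed
// last, so every call walks the whole buffer before finding a match.
func BenchmarkIndexByte4K(b *testing.B) {
	buf := make([]byte, 4096)
	buf[len(buf)-1] = 'x'
	b.SetBytes(int64(len(buf)))
	for i := 0; i < b.N; i++ {
		if bytes.IndexByte(buf, 'x') != len(buf)-1 {
			b.Fatal("unexpected index")
		}
	}
}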

Fixes #19030

Change-Id: Ifb82bbb3d643ec44b98eaa2d08a07f47e5c2fd11
Reviewed-on: https://go-review.googlesource.com/37670
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
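
The diff below removes the old byte-at-a-time loops from bytes·IndexByte and
strings·IndexByte and routes both through a shared runtime·indexbytebody,
which compares a full doubleword per CMPB and checks two doublewords per loop
iteration. In portable Go the underlying word-at-a-time idea looks roughly
like the sketch below; this is an illustration only (the function name, the
little-endian decoding, and the scalar tail loop are assumptions), not the
runtime code.

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// indexByteWordwise: replicate the target byte across a 64-bit word, XOR it
// against eight input bytes at a time (matching bytes become zero, the
// software analogue of CMPB's 0xFF marker), flag the zero bytes with the
// classic bit trick, and turn the first flag into a byte index with a
// trailing-zero count.
func indexByteWordwise(s []byte, c byte) int {
	const (
		lo = 0x0101010101010101 // 0x01 in every byte lane
		hi = 0x8080808080808080 // 0x80 in every byte lane
	)
	rep := uint64(c) * lo // target byte replicated into all 8 lanes
	i := 0
	for ; i+8 <= len(s); i += 8 {
		x := binary.LittleEndian.Uint64(s[i:]) ^ rep // zero lane == match
		if m := (x - lo) &^ x & hi; m != 0 {
			// The lowest set flag corresponds to the first match.
			return i + bits.TrailingZeros64(m)/8
		}
	}
	for ; i < len(s); i++ { // finish any tail shorter than a doubleword
		if s[i] == c {
			return i
		}
	}
	return -1
}

func main() {
	fmt.Println(indexByteWordwise([]byte("hello gopher"), 'g')) // prints 6
}
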
src/runtime/asm_ppc64x.s

index 4ab5dec5cd0a39dece793d44318d7addd48c969f..caa000bb56d239470ef6164c9369cda3987e81a2 100644
@@ -1113,53 +1113,183 @@ equal:
        MOVBZ   R3,ret+48(FP)
        RET
 
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
-       MOVD    s+0(FP), R3
-       MOVD    s_len+8(FP), R4
-       MOVBZ   c+24(FP), R5    // byte to find
-       MOVD    R3, R6          // store base for later
-       SUB     $1, R3
-       ADD     R3, R4          // end-1
+TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+       MOVD    s+0(FP), R3             // R3 = byte array pointer
+       MOVD    s_len+8(FP), R4         // R4 = length
+       MOVBZ   c+24(FP), R5            // R5 = byte
+       MOVD    $ret+32(FP), R14        // R14 = &ret
+       BR      runtime·indexbytebody<>(SB)
+
+TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
+       MOVD    s+0(FP), R3       // R3 = string
+       MOVD    s_len+8(FP), R4   // R4 = length
+       MOVBZ   c+16(FP), R5      // R5 = byte
+       MOVD    $ret+24(FP), R14  // R14 = &ret
+       BR      runtime·indexbytebody<>(SB)
+
+TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+       DCBT    (R3)            // Prepare cache line.
+       MOVD    R3,R10          // Save base address for calculating the index later.
+       RLDICR  $0,R3,$60,R8    // Align address to doubleword boundary in R8.
+       RLDIMI  $8,R5,$48,R5    // Replicate the byte across the register.
+
+       // Calculate last acceptable address and check for possible overflow
+       // using a saturated add.
+       // Overflows set last acceptable address to 0xffffffffffffffff.
+       ADD     R4,R3,R7
+       SUBC    R3,R7,R6
+       SUBE    R0,R0,R9
+       MOVW    R9,R6
+       OR      R6,R7,R7
+
+       RLDIMI  $16,R5,$32,R5
+       CMPU    R4,$32          // Check if it's a small string (<=32 bytes); those are processed differently.
+       MOVD    $-1,R9
+       WORD $0x54661EB8        // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+       RLDIMI  $32,R5,$0,R5
+       ADD     $-1,R7,R7
+#ifdef GOARCH_ppc64le
+       SLD     R6,R9,R9        // Prepare mask for Little Endian
+#else
+       SRD     R6,R9,R9        // Same for Big Endian
+#endif
+       BLE     small_string    // Jump to the small string case if it's <=32 bytes.
+
+       // Case for length >32 bytes
+       MOVD    0(R8),R12       // Load one doubleword from the aligned address in R8.
+       CMPB    R12,R5,R3       // Check for a match.
+       AND     R9,R3,R3        // Mask bytes below s_base
+       RLDICL  $0,R7,$61,R4    // Offset of the last valid byte within its doubleword.
+       RLDICR  $0,R7,$60,R7    // Last doubleword in R7
+       CMPU    R3,$0,CR7       // If we have a match, jump to the final computation
+       BNE     CR7,done
+
+       // Check for doubleword alignment and jump to the loop setup if aligned.
+       MOVFL   R8,CR7
+       BC      12,28,loop_setup
+
+       // Not aligned, so handle the second doubleword
+       MOVDU   8(R8),R12
+       CMPB    R12,R5,R3
+       CMPU    R3,$0,CR7
+       BNE     CR7,done
+
+loop_setup:
+       // We are now aligned to a 16-byte boundary. We will load two doublewords
+       // per loop iteration. The last doubleword is in R7, so our loop counter
+       // starts at (R7-R8)/16.
+       SUB     R8,R7,R6
+       SRD     $4,R6,R6
+       MOVD    R6,CTR
 
+       // Note: when we have an align directive, align this loop to 32 bytes so
+       // it fits in a single icache sector.
 loop:
-       CMP     R3, R4
+       // Load two doublewords, then compare and merge in a single register. We
+       // will check two doublewords per iteration, then find out which of them
+       // contains the byte later. This speeds up the search.
+       MOVD    8(R8),R12
+       MOVDU   16(R8),R11
+       CMPB    R12,R5,R3
+       CMPB    R11,R5,R9
+       OR      R3,R9,R6
+       CMPU    R6,$0,CR7
+       BNE     CR7,found
+       BC      16,0,loop
+
+       // Counter zeroed, but we may have another doubleword to read
+       CMPU    R8,R7
        BEQ     notfound
-       MOVBZU  1(R3), R7
-       CMP     R7, R5
-       BNE     loop
 
-       SUB     R6, R3          // remove base
-       MOVD    R3, ret+32(FP)
-       RET
+       MOVDU   8(R8),R12
+       CMPB    R12,R5,R3
+       CMPU    R3,$0,CR6
+       BNE     CR6,done
 
 notfound:
-       MOVD    $-1, R3
-       MOVD    R3, ret+32(FP)
+       MOVD    $-1,R3
+       MOVD    R3,(R14)
        RET
 
-TEXT strings·IndexByte(SB),NOSPLIT,$0-32
-       MOVD    p+0(FP), R3
-       MOVD    b_len+8(FP), R4
-       MOVBZ   c+16(FP), R5    // byte to find
-       MOVD    R3, R6          // store base for later
-       SUB     $1, R3
-       ADD     R3, R4          // end-1
+found:
+       // One of the doublewords from the loop contains the byte we are looking
+       // for. Check the first doubleword and adjust the address if found.
+       CMPU    R3,$0,CR6
+       ADD     $-8,R8,R8
+       BNE     CR6,done
+
+       // Not found, so it must be in the second doubleword of the merged pair.
+       MOVD    R9,R3
+       ADD     $8,R8,R8
+
+done:
+       // At this point, R3 has 0xFF in the same position as the byte we are
+       // looking for in the doubleword. Use that to calculate the exact index
+       // of the byte.
+#ifdef GOARCH_ppc64le
+       ADD     $-1,R3,R11
+       ANDN    R3,R11,R11
+       POPCNTD R11,R11         // Count trailing zeros (Little Endian).
+#else
+       CNTLZD  R3,R11          // Count leading zeros (Big Endian).
+#endif
+       CMPU    R8,R7           // Check if we are at the last doubleword.
+       SRD     $3,R11          // Convert the bit count to a byte index within the doubleword.
+       ADD     R11,R8,R3
+       CMPU    R11,R4,CR7      // If at the last doubleword, check the byte offset.
+       BNE     return
+       BLE     CR7,return
+       MOVD    $-1,R3
+       MOVD    R3,(R14)
+       RET
 
-loop:
-       CMP     R3, R4
+return:
+       SUB     R10,R3          // Calculate index.
+       MOVD    R3,(R14)
+       RET
+
+small_string:
+       // We unroll this loop for better performance.
+       CMPU    R4,$0           // Check for length=0
        BEQ     notfound
-       MOVBZU  1(R3), R7
-       CMP     R7, R5
-       BNE     loop
 
-       SUB     R6, R3          // remove base
-       MOVD    R3, ret+24(FP)
-       RET
+       MOVD    0(R8),R12       // Load one doubleword from the aligned address in R8.
+       CMPB    R12,R5,R3       // Check for a match.
+       AND     R9,R3,R3        // Mask bytes below s_base.
+       CMPU    R3,$0,CR7       // If we have a match, jump to the final computation.
+       RLDICL  $0,R7,$61,R4    // Offset of the last valid byte within its doubleword.
+       RLDICR  $0,R7,$60,R7    // Last doubleword in R7.
+       CMPU    R8,R7
+       BNE     CR7,done
+       BEQ     notfound        // Hit length.
+
+       MOVDU   8(R8),R12
+       CMPB    R12,R5,R3
+       CMPU    R3,$0,CR6
+       CMPU    R8,R7
+       BNE     CR6,done
+       BEQ     notfound
 
-notfound:
-       MOVD    $-1, R3
-       MOVD    R3, ret+24(FP)
-       RET
+       MOVDU   8(R8),R12
+       CMPB    R12,R5,R3
+       CMPU    R3,$0,CR6
+       CMPU    R8,R7
+       BNE     CR6,done
+       BEQ     notfound
+
+       MOVDU   8(R8),R12
+       CMPB    R12,R5,R3
+       CMPU    R3,$0,CR6
+       CMPU    R8,R7
+       BNE     CR6,done
+       BEQ     notfound
+
+       MOVDU   8(R8),R12
+       CMPB    R12,R5,R3
+       CMPU    R3,$0,CR6
+       CMPU    R8,R7
+       BNE     CR6,done
+       BR      notfound
 
 TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
        MOVD    s1_base+0(FP), R5
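
The two thin wrappers at the top of the new code (bytes·IndexByte and
strings·IndexByte) just load the Go arguments from the frame, stash the
result address in R14, and branch to the shared body. From Go they are
reached through the ordinary exported functions, for example:

package main

import (
	"bytes"
	"fmt"
	"strings"
)

func main() {
	// On ppc64/ppc64le these calls dispatch to the assembly entry points
	// patched above (bytes·IndexByte and strings·IndexByte).
	fmt.Println(bytes.IndexByte([]byte("gopher"), 'p')) // 2
	fmt.Println(strings.IndexByte("gopher", 'h'))       // 3
	fmt.Println(strings.IndexByte("gopher", 'z'))       // -1
}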