internal/bytealg: optimize indexbyte function for ppc64le/power9
author Archana R <aravind5@in.ibm.com>
Fri, 1 Apr 2022 17:05:07 +0000 (12:05 -0500)
committer Lynn Boger <laboger@linux.vnet.ibm.com>
Fri, 15 Apr 2022 16:05:09 +0000 (16:05 +0000)
Added POWER9-specific code that does not need prealignment before
loading vectors. Optimized the vector loop to jump out as soon as there
is a match instead of accumulating matches for 4 indices and then
processing them. For small input sizes such as 10 bytes, the caller
function dominates performance, which accounts for the small regression
in IndexByte/10 below.
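
As a rough scalar sketch of the restructured loop shape (in Go, with
bytes.IndexByte standing in for the VCMPEQUBCC vector compare; this is
illustrative only, not the actual implementation, which is the assembly
below):

    package main

    import (
        "bytes"
        "fmt"
    )

    // indexByteChunked scans 16 bytes at a time and bails out on the
    // first chunk containing the byte, rather than accumulating compare
    // results for 4 chunks and post-processing them as the old loop did.
    func indexByteChunked(s []byte, c byte) int {
        const chunk = 16
        i := 0
        for ; i+chunk <= len(s); i += chunk {
            if j := bytes.IndexByte(s[i:i+chunk], c); j >= 0 {
                return i + j // early exit, like the BNE CR6,foundatN branches
            }
        }
        if j := bytes.IndexByte(s[i:], c); j >= 0 {
            return i + j
        }
        return -1
    }

    func main() {
        fmt.Println(indexByteChunked([]byte("hello, world"), 'w')) // 7
    }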

name                      old time/op    new time/op    delta
IndexByte/10                9.20ns ± 0%   10.40ns ± 0%  +13.08%
IndexByte/32                9.77ns ± 0%    9.20ns ± 0%   -5.84%
IndexByte/4K                 171ns ± 0%     136ns ± 0%  -20.51%
IndexByte/4M                 154µs ± 0%     126µs ± 0%  -17.92%
IndexByte/64M               2.48ms ± 0%    2.03ms ± 0%  -18.27%
IndexAnyASCII/1:32          10.2ns ± 1%     9.2ns ± 0%   -9.19%
IndexAnyASCII/1:64          11.3ns ± 0%    10.1ns ± 0%  -11.29%
IndexAnyUTF8/1:64           11.4ns ± 0%     9.8ns ± 0%  -13.73%
IndexAnyUTF8/16:64           156ns ± 1%     131ns ± 0%  -16.23%
IndexAnyUTF8/256:64         2.27µs ± 0%    1.86µs ± 0%  -18.03%
LastIndexAnyUTF8/1:64       11.8ns ± 0%    10.5ns ± 0%  -10.81%
LastIndexAnyUTF8/16:64       165ns ±11%     132ns ± 0%  -19.75%
LastIndexAnyUTF8/256:2      1.68µs ± 0%    1.44µs ± 0%  -14.33%
LastIndexAnyUTF8/256:4      1.68µs ± 0%    1.49µs ± 0%  -11.10%
LastIndexAnyUTF8/256:8      1.68µs ± 0%    1.50µs ± 0%  -11.05%
LastIndexAnyUTF8/256:64     2.30µs ± 0%    1.90µs ± 0%  -17.56%
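
Tables of this form are produced with benchstat
(golang.org/x/perf/cmd/benchstat) over repeated runs of the bytes and
strings package benchmarks; a typical workflow, with file names chosen
for illustration, looks like:

    go test -run='^$' -bench=IndexByte -count=10 bytes > old.txt
    # rebuild with the patch applied, then:
    go test -run='^$' -bench=IndexByte -count=10 bytes > new.txt
    benchstat old.txt new.txt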
Change-Id: I3d2550bdfdea38fece2da9960bbe62fe6cb1840c
Reviewed-on: https://go-review.googlesource.com/c/go/+/397614
Reviewed-by: Paul Murphy <murp@ibm.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Run-TryBot: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
src/internal/bytealg/indexbyte_ppc64x.s

index 4cc2b440876e54dcefbef9e99b50ffe60505feb7..1a6e852d67258f107270cbce9bbb0c7bff4e841f 100644
@@ -11,17 +11,20 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
        // R3 = byte array pointer
        // R4 = length
        MOVD    R6, R5          // R5 = byte
+       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
        BR      indexbytebody<>(SB)
 
 TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
        // R3 = string
        // R4 = length
        // R5 = byte
+       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
        BR      indexbytebody<>(SB)
 
 // R3 = addr of string
 // R4 = len of string
 // R5 = byte to find
+// R16 = 1 if running on a POWER9 system, 0 otherwise
 // On exit:
 // R3 = return value
 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
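
Both entry points now load the POWER9 feature byte from internal/cpu
into R16 before branching to the shared body; const_offsetPPC64HasPOWER9
is the generated assembly constant for the offset of cpu.PPC64.IsPOWER9.
A minimal Go sketch of the dispatch (hasPOWER9 and the helpers are
illustrative stand-ins; internal/cpu is importable only from within the
standard library):

    package sketch

    // hasPOWER9 stands in for cpu.PPC64.IsPOWER9, which the assembly
    // reads through the generated const_offsetPPC64HasPOWER9 offset.
    var hasPOWER9 bool

    func indexByte(s []byte, c byte) int {
        if hasPOWER9 {
            return indexBytePOWER9(s, c) // unaligned vector loads, early exit
        }
        return indexBytePOWER8(s, c) // original prealigned path
    }

    // Portable stand-ins: the real versions are the two assembly paths.
    func indexBytePOWER9(s []byte, c byte) int { return scan(s, c) }
    func indexBytePOWER8(s []byte, c byte) int { return scan(s, c) }

    func scan(s []byte, c byte) int {
        for i := range s {
            if s[i] == c {
                return i
            }
        }
        return -1
    }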
@@ -29,12 +32,11 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
        RLDICR  $0,R3,$60,R8    // Align address to doubleword boundary in R8.
        RLDIMI  $8,R5,$48,R5    // Replicating the byte across the register.
        ADD     R4,R3,R7        // Last acceptable address in R7.
-       DCBT    (R8)            // Prepare cache line.
 
        RLDIMI  $16,R5,$32,R5
        CMPU    R4,$32          // Check if it's a small string (≤32 bytes). Those will be processed differently.
        MOVD    $-1,R9
-       WORD    $0x54661EB8     // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+       RLWNM   $3,R3,$26,$28,R6        // shift amount for mask (r3&0x7)*8
        RLDIMI  $32,R5,$0,R5
        MOVD    R7,R10          // Save last acceptable address in R10 for later.
        ADD     $-1,R7,R7
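
The new RLWNM replaces a hard-coded WORD opcode with the equivalent
named instruction; together with the SLD/SRD below, it builds the mask
that discards compare hits from bytes sitting before the string start in
the aligned doubleword. In Go terms (little-endian case shown; names
illustrative):

    package sketch

    // belowBaseMask mirrors the RLWNM/SLD pair: pad is the number of
    // bits occupied by bytes before the string start within its aligned
    // doubleword; the mask zeroes match bits for those bytes.
    // (Big endian shifts right instead, per the SRD under #else.)
    func belowBaseMask(addr uintptr) uint64 {
        pad := (addr & 7) * 8    // RLWNM $3,R3,$26,$28,R6: (r3&0x7)*8
        return ^uint64(0) << pad // SLD R6,R9,R9 with R9 = -1
    }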
@@ -43,8 +45,77 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
 #else
        SRD     R6,R9,R9        // Same for Big Endian
 #endif
-       BLE     small_string    // Jump to the small string case if it's ≤32 bytes.
-
+       BLT     small_string    // Jump to the small string case if it's <32 bytes.
+       CMP     R16,$1          // optimize for power8 v power9
+       BNE     power8
+       VSPLTISB        $3,V10  // Use V10 as control for VBPERMQ
+       MTVRD   R5,V1
+       LVSL    (R0+R0),V11     // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
+       VSLB    V11,V10,V10     // to extract the first bit of match result into GPR
+       VSPLTB  $7,V1,V1        // Replicate byte across V1
+       CMP     R4,$64
+       MOVD    $16,R11
+       MOVD    R3,R8
+       BLT     cmp32
+       MOVD    $32,R12
+       MOVD    $48,R6
+
+loop64:
+       LXVB16X (R0)(R8),V2     // scan 64 bytes at a time
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0    // match found at R8, jump out
+
+       LXVB16X (R8)(R11),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat1    // match found at R8+16 bytes, jump out
+
+       LXVB16X (R8)(R12),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat2    // match found at R8+32 bytes, jump out
+
+       LXVB16X (R8)(R6),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat3    // match found at R8+48 bytes, jump out
+       ADD     $64,R8
+       ADD     $-64,R4
+       CMP     R4,$64          // >=64 bytes left to scan?
+       BGE     loop64
+       CMP     R4,$32
+       BLT     rem             // jump to rem if there are < 32 bytes left
+cmp32:
+       LXVB16X (R0)(R8),V2     // 32-63 bytes left
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0    // match found at R8
+
+       LXVB16X (R11)(R8),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat1    // match found at R8+16
+
+       ADD     $32,R8
+       ADD     $-32,R4
+rem:
+       RLDICR  $0,R8,$60,R8    // align address to reuse code for tail end processing
+       BR      small_string
+
+foundat3:
+       ADD     $16,R8
+foundat2:
+       ADD     $16,R8
+foundat1:
+       ADD     $16,R8
+foundat0:
+       // Compress the result into a single doubleword and
+       // move it to a GPR for the final calculation.
+       VBPERMQ V6,V10,V6
+       MFVRD   V6,R3
+       // Count leading zeroes up to the match, which ends up in the low 16
+       // bits in both endian modes; compute the index by subtracting 16.
+       CNTLZW  R3,R11
+       ADD     $-16,R11
+       ADD     R8,R11,R3       // Calculate byte address
+       SUB     R17,R3
+       RET
+power8:
        // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
        // in V0, V1 and V10, then branch to the preloop.
        ANDCC   $63,R3,R11
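
The foundatN tail above collapses the 16 per-byte compare bits into an
index with a single count-leading-zeros. The arithmetic can be checked
with a short Go program (the mask layout, byte 0 of the chunk in the
most significant of the low 16 bits, follows from the LVSL/VSLB-built
permute control in V10):

    package main

    import (
        "fmt"
        "math/bits"
    )

    func main() {
        // Say VCMPEQUBCC matched byte 5 of a 16-byte chunk. VBPERMQ
        // packs one bit per byte into the low 16 bits of a GPR, with
        // byte 0 of the chunk in the most significant of those 16 bits.
        const matched = 5
        mask := uint32(1) << (15 - matched)

        // CNTLZW counts zeros from bit 31; the mask occupies bits
        // 15..0, so subtracting 16 (the ADD $-16,R11) leaves the byte
        // offset, which is then added to the chunk address (R8) and
        // rebased by subtracting the string base (R17).
        idx := bits.LeadingZeros32(mask) - 16
        fmt.Println(idx) // 5
    }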
@@ -54,7 +125,6 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
        MOVD    0(R8),R12       // Load one doubleword from the aligned address in R8.
        CMPB    R12,R5,R3       // Check for a match.
        AND     R9,R3,R3        // Mask bytes below s_base
-       RLDICL  $0,R7,$61,R6    // length-1
        RLDICR  $0,R7,$60,R7    // Last doubleword in R7
        CMPU    R3,$0,CR7       // If we have a match, jump to the final computation
        BNE     CR7,done
@@ -252,8 +322,13 @@ found_qw_align:
        CMPU      R11,R4
        BLT       return
        BR        notfound
+       PCALIGN   $16
 
 done:
+       ADD     $-1,R10,R6
+       // Offset of last index for the final
+       // doubleword comparison
+       RLDICL  $0,R6,$61,R6
        // At this point, R3 has 0xFF in the same position as the byte we are
        // looking for in the doubleword. Use that to calculate the exact index
        // of the byte.
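
The two lines added under done: recompute, on demand, the value the
deleted RLDICL instructions used to keep live in R6 through the whole
body: the offset of the last valid byte within its final doubleword. A
one-line Go equivalent (end standing in for R10, the first address past
the string):

    package sketch

    // lastByteOffset mirrors ADD $-1,R10,R6 followed by
    // RLDICL $0,R6,$61,R6: the low 3 bits of (end-1) are the offset of
    // the last valid byte within its doubleword.
    func lastByteOffset(end uintptr) uintptr {
        return (end - 1) & 7
    }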
@@ -273,6 +348,7 @@ done:
        BR      notfound
 
 small_string:
+       // process string of length < 32 bytes
        // We unroll this loop for better performance.
        CMPU    R4,$0           // Check for length=0
        BEQ     notfound
@@ -281,7 +357,6 @@ small_string:
        CMPB    R12,R5,R3       // Check for a match.
        AND     R9,R3,R3        // Mask bytes below s_base.
        CMPU    R3,$0,CR7       // If we have a match, jump to the final computation.
-       RLDICL  $0,R7,$61,R6    // length-1
        RLDICR  $0,R7,$60,R7    // Last doubleword in R7.
        CMPU    R8,R7
        BNE     CR7,done
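
small_string leans on CMPB, which writes 0xFF into each byte of the
result where the corresponding bytes of its two operands are equal; the
operand register was filled with the target byte by the RLDIMI
replication sequence at the top. A portable Go analogue of the same
test, using the classic zero-byte trick rather than anything the
hardware instruction does literally:

    package main

    import "fmt"

    // containsByte reports whether any byte of w equals c, mirroring
    // what CMPB against a c-replicated register establishes for one
    // doubleword.
    func containsByte(w uint64, c byte) bool {
        const lo = 0x0101010101010101
        const hi = 0x8080808080808080
        x := w ^ (lo * uint64(c)) // zero byte exactly where w matches c
        return (x-lo)&^x&hi != 0  // standard zero-byte detection
    }

    func main() {
        fmt.Println(containsByte(0x1122334455667788, 0x55)) // true
        fmt.Println(containsByte(0x1122334455667788, 0x99)) // false
    }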