From: Archana R Date: Fri, 1 Apr 2022 17:05:07 +0000 (-0500) Subject: internal/bytealg: optimize indexbyte function for ppc64le/power9 X-Git-Tag: go1.19beta1~657 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=5c707f5f3ace728f08997960ec67d9f55cdbf1a3;p=gostls13.git internal/bytealg: optimize indexbyte function for ppc64le/power9 Added specific code For POWER9 that does not need prealignment prior to load vector. Optimized vector loop to jump out as soon as there is a match instead of accumulating matches for 4 indices and then processing the same. For small input size 10, the caller function dominates performance. name old time/op new time/op delta IndexByte/10 9.20ns ± 0% 10.40ns ± 0% +13.08% IndexByte/32 9.77ns ± 0% 9.20ns ± 0% -5.84% IndexByte/4K 171ns ± 0% 136ns ± 0% -20.51% IndexByte/4M 154µs ± 0% 126µs ± 0% -17.92% IndexByte/64M 2.48ms ± 0% 2.03ms ± 0% -18.27% IndexAnyASCII/1:32 10.2ns ± 1% 9.2ns ± 0% -9.19% IndexAnyASCII/1:64 11.3ns ± 0% 10.1ns ± 0% -11.29% IndexAnyUTF8/1:64 11.4ns ± 0% 9.8ns ± 0% -13.73% IndexAnyUTF8/16:64 156ns ± 1% 131ns ± 0% -16.23% IndexAnyUTF8/256:64 2.27µs ± 0% 1.86µs ± 0% -18.03% LastIndexAnyUTF8/1:64 11.8ns ± 0% 10.5ns ± 0% -10.81% LastIndexAnyUTF8/16:64 165ns ±11% 132ns ± 0% -19.75% LastIndexAnyUTF8/256:2 1.68µs ± 0% 1.44µs ± 0% -14.33% LastIndexAnyUTF8/256:4 1.68µs ± 0% 1.49µs ± 0% -11.10% LastIndexAnyUTF8/256:8 1.68µs ± 0% 1.50µs ± 0% -11.05% LastIndexAnyUTF8/256:64 2.30µs ± 0% 1.90µs ± 0% -17.56% Change-Id: I3d2550bdfdea38fece2da9960bbe62fe6cb1840c Reviewed-on: https://go-review.googlesource.com/c/go/+/397614 Reviewed-by: Paul Murphy Reviewed-by: Cherry Mui Run-TryBot: Archana Ravindar TryBot-Result: Gopher Robot Reviewed-by: Russ Cox --- diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s index 4cc2b44087..1a6e852d67 100644 --- a/src/internal/bytealg/indexbyte_ppc64x.s +++ b/src/internal/bytealg/indexbyte_ppc64x.s @@ -11,17 +11,20 @@ TEXT ·IndexByte(SB),NOSPLIT|NOFRAME,$0-40 // R3 = byte array pointer // R4 = length MOVD R6, R5 // R5 = byte + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 BR indexbytebody<>(SB) TEXT ·IndexByteString(SB),NOSPLIT|NOFRAME,$0-32 // R3 = string // R4 = length // R5 = byte + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 BR indexbytebody<>(SB) // R3 = addr of string // R4 = len of string // R5 = byte to find +// R16 = 1 if running on a POWER9 system, 0 otherwise // On exit: // R3 = return value TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 @@ -29,12 +32,11 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8. RLDIMI $8,R5,$48,R5 // Replicating the byte across the register. ADD R4,R3,R7 // Last acceptable address in R7. - DCBT (R8) // Prepare cache line. RLDIMI $16,R5,$32,R5 CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently. MOVD $-1,R9 - WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28). + RLWNM $3,R3,$26,$28,R6 // shift amount for mask (r3&0x7)*8 RLDIMI $32,R5,$0,R5 MOVD R7,R10 // Save last acceptable address in R10 for later. ADD $-1,R7,R7 @@ -43,8 +45,77 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 #else SRD R6,R9,R9 // Same for Big Endian #endif - BLE small_string // Jump to the small string case if it's ≤32 bytes. - + BLT small_string // Jump to the small string case if it's <32 bytes. + CMP R16,$1 // optimize for power8 v power9 + BNE power8 + VSPLTISB $3,V10 // Use V10 as control for VBPERMQ + MTVRD R5,V1 + LVSL (R0+R0),V11 // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0} + VSLB V11,V10,V10 // to extract the first bit of match result into GPR + VSPLTB $7,V1,V1 // Replicate byte across V1 + CMP R4,$64 + MOVD $16,R11 + MOVD R3,R8 + BLT cmp32 + MOVD $32,R12 + MOVD $48,R6 + +loop64: + LXVB16X (R0)(R8),V2 // scan 64 bytes at a time + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat0 // match found at R8, jump out + + LXVB16X (R8)(R11),V2 + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat1 // match found at R8+16 bytes, jump out + + LXVB16X (R8)(R12),V2 + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat2 // match found at R8+32 bytes, jump out + + LXVB16X (R8)(R6),V2 + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat3 // match found at R8+48 bytes, jump out + ADD $64,R8 + ADD $-64,R4 + CMP R4,$64 // >=64 bytes left to scan? + BGE loop64 + CMP R4,$32 + BLT rem // jump to rem if there are < 32 bytes left +cmp32: + LXVB16X (R0)(R8),V2 // 32-63 bytes left + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat0 // match found at R8 + + LXVB16X (R11)(R8),V2 + VCMPEQUBCC V2,V1,V6 + BNE CR6,foundat1 // match found at R8+16 + + ADD $32,R8 + ADD $-32,R4 +rem: + RLDICR $0,R8,$60,R8 // align address to reuse code for tail end processing + BR small_string + +foundat3: + ADD $16,R8 +foundat2: + ADD $16,R8 +foundat1: + ADD $16,R8 +foundat0: + // Compress the result into a single doubleword and + // move it to a GPR for the final calculation. + VBPERMQ V6,V10,V6 + MFVRD V6,R3 + // count leading zeroes upto the match that ends up in low 16 bits + // in both endian modes, compute index by subtracting the number by 16 + CNTLZW R3,R11 + ADD $-16,R11 + ADD R8,R11,R3 // Calculate byte address + SUB R17,R3 + RET +power8: // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values // in V0, V1 and V10, then branch to the preloop. ANDCC $63,R3,R11 @@ -54,7 +125,6 @@ TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8. CMPB R12,R5,R3 // Check for a match. AND R9,R3,R3 // Mask bytes below s_base - RLDICL $0,R7,$61,R6 // length-1 RLDICR $0,R7,$60,R7 // Last doubleword in R7 CMPU R3,$0,CR7 // If we have a match, jump to the final computation BNE CR7,done @@ -252,8 +322,13 @@ found_qw_align: CMPU R11,R4 BLT return BR notfound + PCALIGN $16 done: + ADD $-1,R10,R6 + // Offset of last index for the final + // doubleword comparison + RLDICL $0,R6,$61,R6 // At this point, R3 has 0xFF in the same position as the byte we are // looking for in the doubleword. Use that to calculate the exact index // of the byte. @@ -273,6 +348,7 @@ done: BR notfound small_string: + // process string of length < 32 bytes // We unroll this loop for better performance. CMPU R4,$0 // Check for length=0 BEQ notfound @@ -281,7 +357,6 @@ small_string: CMPB R12,R5,R3 // Check for a match. AND R9,R3,R3 // Mask bytes below s_base. CMPU R3,$0,CR7 // If we have a match, jump to the final computation. - RLDICL $0,R7,$61,R6 // length-1 RLDICR $0,R7,$60,R7 // Last doubleword in R7. CMPU R8,R7 BNE CR7,done