// R3 = byte array pointer
// R4 = length
MOVD R6, R5 // R5 = byte
- MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
BR indexbytebody<>(SB)
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
// R3 = string
// R4 = length
// R5 = byte
- MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
BR indexbytebody<>(SB)
+#ifndef GOPPC64_power9
+#ifdef GOARCH_ppc64le
+DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
+DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
+#else
+DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
+DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
+#endif
+GLOBL indexbytevbperm<>+0(SB), RODATA, $16
+#endif
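+
+// Rough sketch of how this constant is used (from the ISA definition of
+// VBPERMQ): each control byte selects one bit of the compare result, and
+// 0x00,0x08,...,0x78 are the leading bits of its 16 bytes. A VCMPEQUB
+// result (0x00 or 0xFF per byte) is thereby packed into a 16 bit mask in
+// which earlier bytes set higher bits; the byte-swapped LE constant
+// compensates for LXVD2X's element ordering.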
+
+// Some operations are endian-specific; choose the correct opcode based on GOARCH.
+// Note: _VCZBEBB is only available on power9 and newer.
+#ifdef GOARCH_ppc64le
+#define _LDBEX MOVDBR
+#define _LWBEX MOVWBR
+#define _LHBEX MOVHBR
+#define _VCZBEBB VCTZLSBB
+#else
+#define _LDBEX MOVD
+#define _LWBEX MOVW
+#define _LHBEX MOVH
+#define _VCZBEBB VCLZLSBB
+#endif
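+
+// With the byte-reversed forms selected on ppc64le, the loads below observe
+// memory in big-endian byte order on both targets, so CMPB plus a leading
+// zero count always locates the lowest-address match first.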
+
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
-// R16 = 1 if running on a POWER9 system, 0 otherwise
// On exit:
// R3 = return value
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
- MOVD R3,R17 // Save base address for calculating the index later.
- RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
- RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
- ADD R4,R3,R7 // Last acceptable address in R7.
+ CMPU R4,$32 // Is this a small string (< 32 bytes)?
- RLDIMI $16,R5,$32,R5
- CMPU R4,$32 // Check if it's a small string (≤32 bytes). Those will be processed differently.
- MOVD $-1,R9
- RLWNM $3,R3,$26,$28,R6 // shift amount for mask (r3&0x7)*8
- RLDIMI $32,R5,$0,R5
- MOVD R7,R10 // Save last acceptable address in R10 for later.
- ADD $-1,R7,R7
-#ifdef GOARCH_ppc64le
- SLD R6,R9,R9 // Prepare mask for Little Endian
-#else
- SRD R6,R9,R9 // Same for Big Endian
+#ifndef GOPPC64_power9
+ // Load the VBPERMQ permute constant used to reduce a compare result
+ // into an ordered bit mask.
+ MOVD $indexbytevbperm<>+00(SB),R16
+ LXVD2X (R16),V0 // V0 = permute control for VBPERMQ
#endif
- BLT small_string // Jump to the small string case if it's <32 bytes.
- CMP R16,$1 // optimize for power8 v power9
- BNE power8
- VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
+
MTVRD R5,V1
- LVSL (R0+R0),V11 // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
- VSLB V11,V10,V10 // to extract the first bit of match result into GPR
VSPLTB $7,V1,V1 // Replicate byte across V1
- CMP R4,$64
+
+ BLT cmp16 // Jump to the small string case if it's <32 bytes.
+
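+ // Lengths >= 64 are scanned 64 bytes at a time by loop64, and any 1-31
+ // byte tail is covered by overlapping 16 byte loads in the cmp64_tail
+ // paths; lengths (and tails) of 32-63 bytes use cmp32.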
+ CMP R4,$64,CR1
MOVD $16,R11
MOVD R3,R8
- BLT cmp32
+ BLT CR1,cmp32 // Special case for length 32 - 63
MOVD $32,R12
MOVD $48,R6
+ RLDICR $0,R4,$63-6,R9 // R9 = len &^ 63
+ ADD R3,R9,R9 // R9 = &s[len &^ 63]
+ ANDCC $63,R4 // (len &= 63) cmp 0.
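+ // The CR0 result (is the tail empty?) is consumed only after loop64;
+ // nothing in the loop body modifies CR0.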
+
+ PCALIGN $16
loop64:
- LXVB16X (R0)(R8),V2 // scan 64 bytes at a time
+ LXVD2X (R0)(R8),V2 // Scan 64 bytes at a time, starting at &s[0]
VCMPEQUBCC V2,V1,V6
- BNE CR6,foundat0 // match found at R8, jump out
+ BNE CR6,foundat0 // Match found at R8, jump out
- LXVB16X (R8)(R11),V2
+ LXVD2X (R11)(R8),V2
VCMPEQUBCC V2,V1,V6
- BNE CR6,foundat1 // match found at R8+16 bytes, jump out
+ BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
- LXVB16X (R8)(R12),V2
+ LXVD2X (R12)(R8),V2
VCMPEQUBCC V2,V1,V6
- BNE CR6,foundat2 // match found at R8+32 bytes, jump out
+ BNE CR6,foundat2 // Match found at R8+32 bytes, jump out
- LXVB16X (R8)(R6),V2
+ LXVD2X (R6)(R8),V2
VCMPEQUBCC V2,V1,V6
- BNE CR6,foundat3 // match found at R8+48 bytes, jump out
+ BNE CR6,foundat3 // Match found at R8+48 bytes, jump out
+
ADD $64,R8
- ADD $-64,R4
- CMP R4,$64 // >=64 bytes left to scan?
- BGE loop64
- CMP R4,$32
- BLT rem // jump to rem if there are < 32 bytes left
-cmp32:
- LXVB16X (R0)(R8),V2 // 32-63 bytes left
+ CMPU R8,R9,CR1
+ BNE CR1,loop64 // R8 != &s[len &^ 63]?
+
+ PCALIGN $32
+ BEQ notfound // Is tail length 0? CR0 is set before entering loop64.
+
+ CMP R4,$32 // Tail length >= 32, use cmp32 path.
+ CMP R4,$16,CR1
+ BGE cmp32
+
+ ADD R8,R4,R9 // R9 = &s[len] (end of the tail)
+ ADD $-16,R9 // R9 = &s[len-16], the final overlapping load address
+ BLE CR1,cmp64_tail_gt0
+
+cmp64_tail_gt16: // Tail length 17 - 31
+ LXVD2X (R0)(R8),V2
VCMPEQUBCC V2,V1,V6
- BNE CR6,foundat0 // match found at R8
+ BNE CR6,foundat0
- LXVB16X (R11)(R8),V2
+cmp64_tail_gt0: // Tail length 1 - 16
+ MOVD R9,R8
+ LXVD2X (R0)(R9),V2
VCMPEQUBCC V2,V1,V6
- BNE CR6,foundat1 // match found at R8+16
+ BNE CR6,foundat0
- ADD $32,R8
- ADD $-32,R4
-rem:
- RLDICR $0,R8,$60,R8 // align address to reuse code for tail end processing
- BR small_string
+ BR notfound
+
+cmp32: // Length 32 - 63
+
+ // Bytes 0 - 15
+ LXVD2X (R0)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0
+
+ // Bytes 16 - 31
+ LXVD2X (R8)(R11),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat1 // Match found at R8+16 bytes, jump out
+
+ BEQ notfound // Is length <= 32? (CR0 holds this comparison on entry to cmp32)
+ CMP R4,$48
+
+ ADD R4,R8,R9 // Compute &s[len(s)-16]
+ ADD $32,R8,R8
+ ADD $-16,R9,R9
+ ISEL CR0GT,R8,R9,R8 // R8 = len(s) <= 48 ? R9 : R8
+
+ // Bytes 32 - 47
+ LXVD2X (R0)(R8),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // Match found at R8, jump out
+
+ BLE notfound
+ // Bytes 48 - 63
+ MOVD R9,R8 // R8 = &s[len-16], the final overlapping check
+ LXVD2X (R0)(R9),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // Match found at R8, jump out
+
+ BR notfound
+
+// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
+#ifndef GOPPC64_power9
+#define ADJUST_FOR_CNTLZW -16
+#else
+#define ADJUST_FOR_CNTLZW 0
+#endif
+
+// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
+// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
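+// For example, a match at byte 5 of the vector loaded from R8+32 reaches
+// foundat2 with R3 = (R8-R3) + 32 - 16; vfound's CNTLZW then adds 16+5
+// (16 zero bits above the mask plus 5 clear mask bits), giving the final
+// index (R8-R3) + 37. On power9, _VCZBEBB adds 5 and needs no adjustment.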
foundat3:
- ADD $16,R8
+ SUB R3,R8,R3
+ ADD $48+ADJUST_FOR_CNTLZW,R3
+ BR vfound
foundat2:
- ADD $16,R8
+ SUB R3,R8,R3
+ ADD $32+ADJUST_FOR_CNTLZW,R3
+ BR vfound
foundat1:
- ADD $16,R8
+ SUB R3,R8,R3
+ ADD $16+ADJUST_FOR_CNTLZW,R3
+ BR vfound
foundat0:
- // Compress the result into a single doubleword and
- // move it to a GPR for the final calculation.
- VBPERMQ V6,V10,V6
- MFVRD V6,R3
- // count leading zeroes upto the match that ends up in low 16 bits
- // in both endian modes, compute index by subtracting the number by 16
- CNTLZW R3,R11
- ADD $-16,R11
- ADD R8,R11,R3 // Calculate byte address
- SUB R17,R3
+ SUB R3,R8,R3
+ ADD $0+ADJUST_FOR_CNTLZW,R3
+vfound:
+ // Map equal values into a 16 bit value with earlier matches setting higher bits.
+#ifndef GOPPC64_power9
+ VBPERMQ V6,V0,V6
+ MFVRD V6,R4
+ CNTLZW R4,R4
+#else
+#ifdef GOARCH_ppc64le
+ // Put the value back into LE ordering by swapping doublewords.
+ XXPERMDI V6,V6,$2,V6
+#endif
+ _VCZBEBB V6,R4
+#endif
+ ADD R3,R4,R3 // index = vector offset + byte offset within the vector
RET
-power8:
- // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
- // in V0, V1 and V10, then branch to the preloop.
- ANDCC $63,R3,R11
- BEQ CR0,qw_align
- RLDICL $0,R3,$61,R11
-
- MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
- CMPB R12,R5,R3 // Check for a match.
- AND R9,R3,R3 // Mask bytes below s_base
- RLDICR $0,R7,$60,R7 // Last doubleword in R7
- CMPU R3,$0,CR7 // If we have a match, jump to the final computation
- BNE CR7,done
- ADD $8,R8,R8
- ADD $-8,R4,R4
- ADD R4,R11,R4
- // Check for quadword alignment
- ANDCC $15,R8,R11
- BEQ CR0,qw_align
+cmp16: // Length 16 - 31
+ CMPU R4,$16
+ ADD R4,R3,R9
+ BLT cmp8
- // Not aligned, so handle the next doubleword
- MOVD 0(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR7
- BNE CR7,done
- ADD $8,R8,R8
- ADD $-8,R4,R4
+ ADD $-16,R9,R9 // &s[len(s)-16]
- // Either quadword aligned or 64-byte at this point. We can use LVX.
-qw_align:
-
- // Set up auxiliary data for the vectorized algorithm.
- VSPLTISB $0,V0 // Replicate 0 across V0
- VSPLTISB $3,V10 // Use V10 as control for VBPERMQ
- MTVRD R5,V1
- LVSL (R0+R0),V11
- VSLB V11,V10,V10
- VSPLTB $7,V1,V1 // Replicate byte across V1
- CMPU R4, $64 // If len ≤ 64, don't use the vectorized loop
- BLE tail
-
- // We will load 4 quardwords per iteration in the loop, so check for
- // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
- ANDCC $63,R8,R11
- BEQ CR0,preloop
-
- // Not 64-byte aligned. Load one quadword at a time until aligned.
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- ADD $-16,R4,R4
-
- ANDCC $63,R8,R11
- BEQ CR0,preloop
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- ADD $-16,R4,R4
-
- ANDCC $63,R8,R11
- BEQ CR0,preloop
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6 // Check for byte in V4
- BNE CR6,found_qw_align
- ADD $-16,R4,R4
- ADD $16,R8,R8
-
- // 64-byte aligned. Prepare for the main loop.
-preloop:
- CMPU R4,$64
- BLE tail // If len ≤ 64, don't use the vectorized loop
-
- // We are now aligned to a 64-byte boundary. We will load 4 quadwords
- // per loop iteration. The last doubleword is in R10, so our loop counter
- // starts at (R10-R8)/64.
- SUB R8,R10,R6
- SRD $6,R6,R9 // Loop counter in R9
- MOVD R9,CTR
-
- ADD $-64,R8,R8 // Adjust index for loop entry
- MOVD $16,R11 // Load offsets for the vector loads
- MOVD $32,R9
- MOVD $48,R7
-
- // Main loop we will load 64 bytes per iteration
-loop:
- ADD $64,R8,R8 // Fuse addi+lvx for performance
- LVX (R8+R0),V2 // Load 4 16-byte vectors
- LVX (R8+R11),V3
- VCMPEQUB V1,V2,V6 // Look for byte in each vector
- VCMPEQUB V1,V3,V7
-
- LVX (R8+R9),V4
- LVX (R8+R7),V5
- VCMPEQUB V1,V4,V8
- VCMPEQUB V1,V5,V9
-
- VOR V6,V7,V11 // Compress the result in a single vector
- VOR V8,V9,V12
- VOR V11,V12,V13
- VCMPEQUBCC V0,V13,V14 // Check for byte
- BGE CR6,found
- BC 16,0,loop // bdnz loop
-
- // Handle the tailing bytes or R4 ≤ 64
- RLDICL $0,R6,$58,R4
- ADD $64,R8,R8
-tail:
- CMPU R4,$0
- BEQ notfound
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
- ADD $16,R8,R8
- CMPU R4,$16,CR6
- BLE CR6,notfound
- ADD $-16,R4,R4
-
- LVX (R8+R0),V4
- VCMPEQUBCC V1,V4,V6
- BNE CR6,found_qw_align
+ // Bytes 0 - 15
+ LXVD2X (R0)(R3),V2
+ VCMPEQUBCC V2,V1,V6
+ MOVD R3,R8
+ BNE CR6,foundat0 // Match found at R8, jump out
-notfound:
- MOVD $-1, R3
- RET
+ BEQ notfound
-found:
- // We will now compress the results into a single doubleword,
- // so it can be moved to a GPR for the final index calculation.
-
- // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
- // first bit of each byte into bits 48-63.
- VBPERMQ V6,V10,V6
- VBPERMQ V7,V10,V7
- VBPERMQ V8,V10,V8
- VBPERMQ V9,V10,V9
-
- // Shift each 16-bit component into its correct position for
- // merging into a single doubleword.
-#ifdef GOARCH_ppc64le
- VSLDOI $2,V7,V7,V7
- VSLDOI $4,V8,V8,V8
- VSLDOI $6,V9,V9,V9
-#else
- VSLDOI $6,V6,V6,V6
- VSLDOI $4,V7,V7,V7
- VSLDOI $2,V8,V8,V8
-#endif
+ // Bytes 16 - 30
+ MOVD R9,R8 // R8 = &s[len-16], the final overlapping check
+ LXVD2X (R0)(R9),V2
+ VCMPEQUBCC V2,V1,V6
+ BNE CR6,foundat0 // Match found at R8, jump out
+
+ BR notfound
- // Merge V6-V9 into a single doubleword and move to a GPR.
- VOR V6,V7,V11
- VOR V8,V9,V4
- VOR V4,V11,V4
- MFVRD V4,R3
-#ifdef GOARCH_ppc64le
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11 // Count trailing zeros (Little Endian).
+cmp8: // Length 8 - 15 (0 - 15 on power10)
+#ifdef GOPPC64_power10
+ // Load all the bytes into a single VSR in BE order.
+ SLD $56,R4,R5 // LXVLL takes the length from the high byte of R5
+ LXVLL R3,R5,V2
+ // Compare and count the number which don't match.
+ VCMPEQUB V2,V1,V6
+ VCLZLSBB V6,R3
+ // If the count is >= the number of bytes, no match was found.
+ CMPU R3,R4
+ MOVD $-1,R5
+ // Otherwise, the count is the index of the first match.
+ ISEL CR0LT,R3,R5,R3 // R3 = (count < len) ? count : -1
+ RET
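+
+ // For example, searching "abcde" (len 5) for 'c': LXVLL loads the 5
+ // bytes left justified, VCLZLSBB counts the 2 bytes before the match,
+ // and 2 < 5 selects R3 = 2. A spurious match in the bytes beyond the
+ // length is rejected by the CMPU/ISEL pair above.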
#else
- CNTLZD R3,R11 // Count leading zeros (Big Endian).
-#endif
- ADD R8,R11,R3 // Calculate byte address
+ RLDIMI $8,R5,$48,R5 // Replicate the byte across R5,
+ RLDIMI $16,R5,$32,R5 // e.g. 0x2F becomes 0x2F2F2F2F2F2F2F2F,
+ RLDIMI $32,R5,$0,R5 // for the CMPB compares below.
+ CMPU R4,$8
+ BLT cmp4
+ MOVD $-8,R11 // Index of the final overlapping doubleword
+ ADD $-8,R4,R4 // R4 = len-8, base index of the final doubleword
-return:
- SUB R17, R3
+ _LDBEX (R0)(R3),R10 // Load s[0:8] in BE order
+ _LDBEX (R11)(R9),R11 // Load s[len-8:len] (overlapping) in BE order
+ CMPB R10,R5,R10 // 0xFF marks each matching byte
+ CMPB R11,R5,R11
+ CMPU R10,$0 // CR0 EQ if no match in s[0:8]
+ CMPU R11,$0,CR1 // CR1 EQ if no match in s[len-8:len]
+ CNTLZD R10,R10
+ CNTLZD R11,R11
+ SRD $3,R10,R3 // Convert leading zero bits to a byte index
+ SRD $3,R11,R11
+ BNE found // Match in s[0:8]; R3 is its index
+
+ ADD R4,R11,R4 // R4 = (len-8) + index within the final doubleword
+ MOVD $-1,R3
+ ISEL CR1EQ,R3,R4,R3 // No match in the final doubleword? R3 = -1
RET
-found_qw_align:
- // Use the same algorithm as above. Compress the result into
- // a single doubleword and move it to a GPR for the final
- // calculation.
- VBPERMQ V6,V10,V6
+cmp4: // Length 4 - 7
+ CMPU R4,$4
+ BLT cmp2
+ MOVD $-4,R11
+ ADD $-4,R4,R4
+
+ _LWBEX (R0)(R3),R10
+ _LWBEX (R11)(R9),R11
+ CMPB R10,R5,R10
+ CMPB R11,R5,R11
+ CNTLZW R10,R10
+ CNTLZW R11,R11
+ CMPU R10,$32 // CNTLZW of 0 is 32: CR0 EQ if no match in s[0:4]
+ CMPU R11,$32,CR1 // Likewise for the final word (CR1)
+ SRD $3,R10,R3
+ SRD $3,R11,R11
+ BNE found
-#ifdef GOARCH_ppc64le
- MFVRD V6,R3
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11
-#else
- VSLDOI $6,V6,V6,V6
- MFVRD V6,R3
- CNTLZD R3,R11
-#endif
- ADD R8,R11,R3
- CMPU R11,R4
- BLT return
- BR notfound
- PCALIGN $16
-
-done:
- ADD $-1,R10,R6
- // Offset of last index for the final
- // doubleword comparison
- RLDICL $0,R6,$61,R6
- // At this point, R3 has 0xFF in the same position as the byte we are
- // looking for in the doubleword. Use that to calculate the exact index
- // of the byte.
-#ifdef GOARCH_ppc64le
- ADD $-1,R3,R11
- ANDN R3,R11,R11
- POPCNTD R11,R11 // Count trailing zeros (Little Endian).
-#else
- CNTLZD R3,R11 // Count leading zeros (Big Endian).
-#endif
- CMPU R8,R7 // Check if we are at the last doubleword.
- SRD $3,R11 // Convert trailing zeros to bytes.
- ADD R11,R8,R3
- CMPU R11,R6,CR7 // If at the last doubleword, check the byte offset.
- BNE return
- BLE CR7,return
- BR notfound
+ ADD R4,R11,R4 // R4 = (len-4) + index within the final word
+ MOVD $-1,R3
+ ISEL CR1EQ,R3,R4,R3 // No match in the final word? R3 = -1
+ RET
-small_string:
- // process string of length < 32 bytes
- // We unroll this loop for better performance.
- CMPU R4,$0 // Check for length=0
- BEQ notfound
+cmp2: // Length 2 - 3
+ CMPU R4,$2
+ BLT cmp1
- MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
- CMPB R12,R5,R3 // Check for a match.
- AND R9,R3,R3 // Mask bytes below s_base.
- CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
- RLDICR $0,R7,$60,R7 // Last doubleword in R7.
- CMPU R8,R7
- BNE CR7,done
- BEQ notfound // Hit length.
-
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
+ _LHBEX (R0)(R3),R10 // Load 2 bytes in BE order
+ CMPB R10,R5,R10 // 0xFF marks each matching byte
+ SLDCC $48,R10,R10 // Shift results to the top; CR0 EQ if no match
+ CNTLZD R10,R10
+ SRD $3,R10,R3 // Convert leading zero bits to a byte index
+ BNE found
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
+cmp1: // Length 0 - 1, or the odd trailing byte for cmp2
+ MOVD $-1,R3 // Assume no match
+ ANDCC $1,R4,R31 // Is there an odd trailing byte to check?
+ BEQ found
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- CMPU R8,R7
- BNE CR6,done
- BEQ notfound
+ MOVBZ -1(R9),R10 // Load the last byte, s[len-1]
+ CMPB R10,R5,R10 // R10 = 0xFF on a match
+ ANDCC $1,R10
+ ADD $-1,R4 // R4 = len-1, the index of the last byte
+ ISEL CR0EQ,R3,R4,R3 // R3 = matched ? len-1 : -1
- MOVDU 8(R8),R12
- CMPB R12,R5,R3
- CMPU R3,$0,CR6
- BNE CR6,done
- BR notfound
+found:
+ RET
+#endif
+
+notfound:
+ MOVD $-1,R3
+ RET