#include "textflag.h"
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
- // R3 = byte array pointer
+ // R3 = byte array pointer
 // R4 = length
- MOVBZ R6, R5 // R5 = byte
- BR countbytebody<>(SB)
+ // R6 = byte to count
+ MTVRD R6, V1 // move compare byte
+ MOVD R6, R5 // also keep the byte in R5 for the scalar tail paths of countbytebody
+ VSPLTB $7, V1, V1 // replicate byte across V1
+ BR countbytebody<>(SB) // tail call into the shared counting body
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
// R3 = byte array pointer
// R4 = length
- MOVBZ R5, R5 // R5 = byte
- BR countbytebody<>(SB)
+ // R5 = byte to count
+ MTVRD R5, V1 // move compare byte
+ VSPLTB $7, V1, V1 // replicate byte across V1
+ BR countbytebody<>(SB) // tail call into the shared counting body
// R3: addr of string
// R4: len of string
// R5: byte to count
+// V1: byte to count, splatted.
// On exit:
// R3: return value
-// endianness shouldn't matter since we are just counting and order
-// is irrelevant
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
- DCBT (R3) // Prepare cache line.
- MOVD R0, R18 // byte count
- MOVD R3, R19 // Save base address for calculating the index later.
- MOVD R4, R16
-
- MOVD R5, R6
- RLDIMI $8, R6, $48, R6
- RLDIMI $16, R6, $32, R6
- RLDIMI $32, R6, $0, R6 // fill reg with the byte to count
-
- VSPLTISW $3, V4 // used for shift
- MTVRD R6, V1 // move compare byte
- VSPLTB $7, V1, V1 // replicate byte across V1
-
- CMPU R4, $32 // Check if it's a small string (<32 bytes)
- BLT tail // Jump to the small string case
- XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
-
+ MOVD $0, R18 // byte count
+
+#ifndef GOPPC64_power10
+ RLDIMI $8, R5, $48, R5
+ RLDIMI $16, R5, $32, R5
+ RLDIMI $32, R5, $0, R5 // fill reg with the byte to count
+#endif
+
+ CMPU R4, $32 // Check if it's a small string (<32 bytes)
+ BLT tail // Jump to the small string case
+ SRD $5, R4, R20 // R20 = number of 32B vector iterations
+ MOVD R20, CTR // iterate with the count register (BDNZ below)
+ MOVD $16, R21 // offset of the second 16B load in each iteration
+ XXLXOR V4, V4, V4 // zero the two match accumulators
+ XXLXOR V5, V5, V5
+
+ PCALIGN $16 // align the hot loop entry
cmploop:
- LXVW4X (R3), VS32 // load bytes from string
-
- // when the bytes match, the corresponding byte contains all 1s
- VCMPEQUB V1, V0, V2 // compare bytes
- VPOPCNTD V2, V3 // each double word contains its count
- VADDUDM V3, V5, V5 // accumulate bit count in each double word
- ADD $16, R3, R3 // increment pointer
- SUB $16, R16, R16 // remaining bytes
- CMP R16, $16 // at least 16 remaining?
- BGE cmploop
- VSRD V5, V4, V5 // shift by 3 to convert bits to bytes
- VSLDOI $8, V5, V5, V6 // get the double word values from vector
- MFVSRD V5, R9
- MFVSRD V6, R10
- ADD R9, R10, R9
- ADD R9, R18, R18
-
-tail:
- CMP R16, $8 // 8 bytes left?
- BLT small
-
- MOVD (R3), R12 // load 8 bytes
- CMPB R12, R6, R17 // compare bytes
- POPCNTD R17, R15 // bit count
- SRD $3, R15, R15 // byte count
- ADD R15, R18, R18 // add to byte count
-
-next1:
- ADD $8, R3, R3
- SUB $8, R16, R16 // remaining bytes
- BR tail
-
-small:
- CMP $0, R16 // any remaining
- BEQ done
- MOVBZ (R3), R12 // check each remaining byte
- CMP R12, R5
- BNE next2
- ADD $1, R18
-
-next2:
- SUB $1, R16
- ADD $1, R3 // inc address
- BR small
-
-done:
- MOVD R18, R3 // return count
+ LXVD2X (R0)(R3), V0 // Count 32B per loop with two vector accumulators.
+ LXVD2X (R21)(R3), V2
+ VCMPEQUB V2, V1, V2
+ VCMPEQUB V0, V1, V0
+ VPOPCNTD V2, V2 // A match is 0xFF or 0. Count the bits into doubleword buckets.
+ VPOPCNTD V0, V0
+ VADDUDM V0, V4, V4 // Accumulate the popcounts. They are 8x the count.
+ VADDUDM V2, V5, V5 // The count will be fixed up afterwards.
+ ADD $32, R3 // advance past the 32B consumed this iteration
+ BDNZ cmploop // decrement CTR; loop while nonzero
+
+ VADDUDM V4, V5, V5 // merge the two accumulators
+ MFVSRD V5, R18 // extract one doubleword of the sum
+ VSLDOI $8, V5, V5, V5 // rotate the other doubleword into position
+ MFVSRD V5, R21
+ ADD R21, R18, R18 // R18 = total match bits (still 8x the byte count)
+ ANDCC $31, R4, R4 // R4 = remaining tail bytes (0-31); sets CR0 for BEQ
+ // Skip the tail processing if no bytes remaining.
+ BEQ tail_0
+
+#ifdef GOPPC64_power10
+ SRD $3, R18, R18 // Fix the vector loop count before counting the tail on P10.
+
+tail: // Count the last 0 - 31 bytes.
+ CMP R4, $16
+ BLE small_tail_p10
+ LXV 0(R3), V0
+ VCMPEQUB V0, V1, V0
+ VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
+ SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
+ ADD R14, R18, R18
+ ADD $16, R3, R3
+ ANDCC $15, R4, R4 // R4 = remaining tail bytes (0-15)
+
+small_tail_p10:
+ SLD $56, R4, R6 // place the tail length in the top byte for LXVLL
+ LXVLL R3, R6, V0
+ VCMPEQUB V0, V1, V0
+ VCLRRB V0, R4, V0 // If <16B being compared, clear matches of the 16-R4 bytes.
+ VCNTMBB V0, $1, R14 // Sum the value of bit 0 of each byte of the compare into R14.
+ SRD $56, R14, R14 // The result of VCNTMBB is shifted. Unshift it.
+ ADD R14, R18, R3
+ RET
+
+#else
+tail: // Count the last 0 - 31 bytes.
+ CMP R4, $16
+ BLT tail_8
+ MOVD (R3), R12
+ MOVD 8(R3), R14
+ CMPB R12, R5, R12 // 0xFF in each byte of R12 that matches R5
+ CMPB R14, R5, R14
+ POPCNTD R12, R12 // 8 bits set per matching byte
+ POPCNTD R14, R14
+ ADD R12, R18, R18
+ ADD R14, R18, R18
+ ADD $16, R3, R3
+ ADD $-16, R4, R4
+
+tail_8: // Count the remaining 0 - 15 bytes.
+ CMP R4, $8
+ BLT tail_4
+ MOVD (R3), R12
+ CMPB R12, R5, R12
+ POPCNTD R12, R12
+ ADD R12, R18, R18
+ ADD $8, R3, R3
+ ADD $-8, R4, R4
+
+tail_4: // Count the remaining 0 - 7 bytes.
+ CMP R4, $4
+ BLT tail_2
+ MOVWZ (R3), R12
+ CMPB R12, R5, R12
+ SLD $32, R12, R12 // Remove non-participating matches.
+ POPCNTD R12, R12
+ ADD R12, R18, R18
+ ADD $4, R3, R3
+ ADD $-4, R4, R4
+
+tail_2: // Count the remaining 0 - 3 bytes.
+ CMP R4, $2
+ BLT tail_1
+ MOVHZ (R3), R12
+ CMPB R12, R5, R12
+ SLD $48, R12, R12 // Remove non-participating matches.
+ POPCNTD R12, R12
+ ADD R12, R18, R18
+ ADD $2, R3, R3
+ ADD $-2, R4, R4
+
+tail_1: // Count the remaining 0 - 1 bytes.
+ CMP R4, $1
+ BLT tail_0
+ MOVBZ (R3), R12
+ CMPB R12, R5, R12 // match byte is 0xFF
+ ANDCC $0x8, R12, R12 // keep one bit of the 0xFF, i.e. 8 per match (still 8x)
+ ADD R12, R18, R18
+#endif
+
+tail_0: // No remaining tail to count.
+ SRD $3, R18, R3 // Fixup count, it is off by 8x.
RET