internal/bytealg: optimize Count/CountString for PPC64/Power10
author     Paul E. Murphy <murp@ibm.com>
           Mon, 17 Jul 2023 20:23:28 +0000 (15:23 -0500)
committer  Paul Murphy <murp@ibm.com>
           Mon, 14 Aug 2023 20:30:44 +0000 (20:30 +0000)
Power10 adds a handful of new instructions (LXVLL, VCLRRB, and
VCNTMBB are used here) which make this noticeably quicker for
smaller values.

Likewise, since the vector loop requires 32B to enter, unroll it
once to count 32B per iteration. This improvement benefits all
PPC64 CPUs.
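
For reference, a minimal, portable Go sketch of the same strategy
(hypothetical helpers countByte and compareBytes, not part of this
change): a 32B-per-iteration main loop, an 8B-at-a-time tail, and a
byte-by-byte cleanup. Matches are tallied as 8 set bits each, like
CMPB+POPCNTD, so the total is divided by 8 at the end.

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math/bits"
)

// countByte counts occurrences of c in b.
func countByte(b []byte, c byte) int {
	pattern := uint64(c) * 0x0101010101010101 // replicate c into every byte lane
	bitTotal := 0

	for len(b) >= 32 { // main loop: 32 bytes per iteration
		for i := 0; i < 32; i += 8 {
			w := binary.LittleEndian.Uint64(b[i:])
			bitTotal += bits.OnesCount64(compareBytes(w, pattern))
		}
		b = b[32:]
	}
	for len(b) >= 8 { // word-sized tail
		w := binary.LittleEndian.Uint64(b)
		bitTotal += bits.OnesCount64(compareBytes(w, pattern))
		b = b[8:]
	}
	count := bitTotal >> 3 // each matching byte contributed 8 bits
	for _, x := range b {  // last 0-7 bytes
		if x == c {
			count++
		}
	}
	return count
}

// compareBytes returns 0xFF in every byte lane where a and b are equal
// and 0x00 elsewhere, the same result CMPB produces.
func compareBytes(a, b uint64) uint64 {
	x := a ^ b // matching lanes become zero
	y := (x & 0x7f7f7f7f7f7f7f7f) + 0x7f7f7f7f7f7f7f7f
	y = ^(y | x | 0x7f7f7f7f7f7f7f7f) // 0x80 in each zero lane, exactly
	return (y >> 7) * 0xff            // widen the per-lane bit to a full 0xFF
}

func main() {
	s := []byte("abracadabra, a banana")
	fmt.Println(countByte(s, 'a'), bytes.Count(s, []byte{'a'})) // 9 9
}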

On Power10, comparing the old and new implementations, both built
with GOPPC64=power8:

name             old time/op   new time/op   delta
CountSingle/10     8.99ns ± 0%    5.55ns ± 3%   -38.24%
CountSingle/16     7.55ns ± 0%    5.56ns ± 3%   -26.37%
CountSingle/17     7.45ns ± 0%    5.25ns ± 0%   -29.52%
CountSingle/31     18.4ns ± 0%     6.2ns ± 0%   -66.41%
CountSingle/32     6.17ns ± 0%    5.04ns ± 0%   -18.37%
CountSingle/33     7.13ns ± 0%    5.99ns ± 0%   -15.94%
CountSingle/4K      198ns ± 0%     115ns ± 0%   -42.08%
CountSingle/4M      190µs ± 0%     109µs ± 0%   -42.49%
CountSingle/64M    3.28ms ± 0%    2.08ms ± 0%   -36.53%

Furthermore, comparing the new tail implementation built with
GOPPC64=power8 against the same code built with GOPPC64=power10:

name             old time/op   new time/op   delta
CountSingle/10     5.55ns ± 3%    4.52ns ± 1%  -18.66%
CountSingle/16     5.56ns ± 3%    4.80ns ± 0%  -13.65%
CountSingle/17     5.25ns ± 0%    4.79ns ± 0%   -8.78%
CountSingle/31     6.17ns ± 0%    4.82ns ± 0%  -21.79%
CountSingle/32     5.04ns ± 0%    5.09ns ± 6%   +1.01%
CountSingle/33     5.99ns ± 0%    5.42ns ± 2%   -9.54%
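
For context, numbers like the above can be produced with a benchmark
along these lines (an illustrative stand-in, not necessarily the exact
benchmark used; it relies only on the public bytes API, which routes
single-byte counts to internal/bytealg.Count):

package countbench // in a *_test.go file (illustrative)

import (
	"bytes"
	"fmt"
	"testing"
)

var sink int // keep the result live so the call is not optimized away

// BenchmarkCountSingle measures single-byte counting, which reaches
// internal/bytealg.Count via bytes.Count when the separator is one byte.
func BenchmarkCountSingle(b *testing.B) {
	for _, n := range []int{10, 16, 17, 31, 32, 33, 4 << 10, 4 << 20, 64 << 20} {
		buf := bytes.Repeat([]byte{'x'}, n)
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(len(buf)))
			for i := 0; i < b.N; i++ {
				sink = bytes.Count(buf, []byte{'z'})
			}
		})
	}
}

Building and running the test binary once with GOPPC64=power8 and once
with GOPPC64=power10, then comparing the outputs (e.g. with benchstat),
gives tables like the second one above.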

Change-Id: I62d80be3b5d706e1abbb4bec7d6278a939a5eed4
Reviewed-on: https://go-review.googlesource.com/c/go/+/512695
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>

src/internal/bytealg/count_ppc64x.s

index 2d2490b0246a4edd4777f9b56484903e9ebf8165..55e02ce8a187ec99364e9a08abd0b9ae70caa15b 100644 (file)
 #include "textflag.h"
 
 TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
-       // R3 = byte array pointer 
+       // R3 = byte array pointer
        // R4 = length
-       MOVBZ R6, R5              // R5 = byte
-       BR    countbytebody<>(SB)
+       // R6 = byte to count
+       MTVRD   R6, V1          // move compare byte
+       MOVD    R6, R5
+       VSPLTB  $7, V1, V1      // replicate byte across V1
+       BR      countbytebody<>(SB)
 
 TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
        // R3 = byte array pointer
        // R4 = length
-       MOVBZ R5, R5              // R5 = byte
-       BR    countbytebody<>(SB)
+       // R5 = byte to count
+       MTVRD   R5, V1          // move compare byte
+       VSPLTB  $7, V1, V1      // replicate byte across V1
+       BR      countbytebody<>(SB)
 
 // R3: addr of string
 // R4: len of string
 // R5: byte to count
+// V1: byte to count, splatted.
 // On exit:
 // R3: return value
-// endianness shouldn't matter since we are just counting and order
-// is irrelevant
 TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
-       DCBT (R3)    // Prepare cache line.
-       MOVD R0, R18 // byte count
-       MOVD R3, R19 // Save base address for calculating the index later.
-       MOVD R4, R16
-
-       MOVD   R5, R6
-       RLDIMI $8, R6, $48, R6
-       RLDIMI $16, R6, $32, R6
-       RLDIMI $32, R6, $0, R6  // fill reg with the byte to count
-
-       VSPLTISW $3, V4     // used for shift
-       MTVRD    R6, V1     // move compare byte
-       VSPLTB   $7, V1, V1 // replicate byte across V1
-
-       CMPU   R4, $32          // Check if it's a small string (<32 bytes)
-       BLT    tail             // Jump to the small string case
-       XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
-
+       MOVD    $0, R18 // byte count
+
+#ifndef GOPPC64_power10
+       RLDIMI  $8, R5, $48, R5
+       RLDIMI  $16, R5, $32, R5
+       RLDIMI  $32, R5, $0, R5 // fill reg with the byte to count
+#endif
+
+       CMPU    R4, $32         // Check if it's a small string (<32 bytes)
+       BLT     tail            // Jump to the small string case
+       SRD     $5, R4, R20
+       MOVD    R20, CTR
+       MOVD    $16, R21
+       XXLXOR  V4, V4, V4
+       XXLXOR  V5, V5, V5
+
+       PCALIGN $16
 cmploop:
-       LXVW4X (R3), VS32 // load bytes from string
-
-       // when the bytes match, the corresponding byte contains all 1s
-       VCMPEQUB V1, V0, V2     // compare bytes
-       VPOPCNTD V2, V3         // each double word contains its count
-       VADDUDM  V3, V5, V5     // accumulate bit count in each double word
-       ADD      $16, R3, R3    // increment pointer
-       SUB      $16, R16, R16  // remaining bytes
-       CMP      R16, $16       // at least 16 remaining?
-       BGE      cmploop
-       VSRD     V5, V4, V5     // shift by 3 to convert bits to bytes
-       VSLDOI   $8, V5, V5, V6 // get the double word values from vector
-       MFVSRD   V5, R9
-       MFVSRD   V6, R10
-       ADD      R9, R10, R9
-       ADD      R9, R18, R18
-
-tail:
-       CMP R16, $8 // 8 bytes left?
-       BLT small
-
-       MOVD    (R3), R12     // load 8 bytes
-       CMPB    R12, R6, R17  // compare bytes
-       POPCNTD R17, R15      // bit count
-       SRD     $3, R15, R15  // byte count
-       ADD     R15, R18, R18 // add to byte count
-
-next1:
-       ADD $8, R3, R3
-       SUB $8, R16, R16 // remaining bytes
-       BR  tail
-
-small:
-       CMP   $0, R16   // any remaining
-       BEQ   done
-       MOVBZ (R3), R12 // check each remaining byte
-       CMP   R12, R5
-       BNE   next2
-       ADD   $1, R18
-
-next2:
-       SUB $1, R16
-       ADD $1, R3  // inc address
-       BR  small
-
-done:
-       MOVD R18, R3    // return count
+       LXVD2X  (R0)(R3), V0    // Count 32B per loop with two vector accumulators.
+       LXVD2X  (R21)(R3), V2
+       VCMPEQUB V2, V1, V2
+       VCMPEQUB V0, V1, V0
+       VPOPCNTD V2, V2         // A match is 0xFF or 0. Count the bits into doubleword buckets.
+       VPOPCNTD V0, V0
+       VADDUDM V0, V4, V4      // Accumulate the popcounts. They are 8x the count.
+       VADDUDM V2, V5, V5      // The count will be fixed up afterwards.
+       ADD     $32, R3
+       BDNZ    cmploop
+
+       VADDUDM V4, V5, V5
+       MFVSRD  V5, R18
+       VSLDOI  $8, V5, V5, V5
+       MFVSRD  V5, R21
+       ADD     R21, R18, R18
+       ANDCC   $31, R4, R4
+       // Skip the tail processing if no bytes remaining.
+       BEQ     tail_0
+
+#ifdef GOPPC64_power10
+       SRD     $3, R18, R18    // Fix the vector loop count before counting the tail on P10.
+
+tail:  // Count the last 0 - 31 bytes.
+       CMP     R4, $16
+       BLE     small_tail_p10
+       LXV     0(R3), V0
+       VCMPEQUB V0, V1, V0
+       VCNTMBB V0, $1, R14     // Sum the value of bit 0 of each byte of the compare into R14.
+       SRD     $56, R14, R14   // The result of VCNTMBB is shifted. Unshift it.
+       ADD     R14, R18, R18
+       ADD     $16, R3, R3
+       ANDCC   $15, R4, R4
+
+small_tail_p10:
+       SLD     $56, R4, R6
+       LXVLL   R3, R6, V0
+       VCMPEQUB V0, V1, V0
+       VCLRRB  V0, R4, V0      // If <16B being compared, clear matches of the 16-R4 bytes.
+       VCNTMBB V0, $1, R14     // Sum the value of bit 0 of each byte of the compare into R14.
+       SRD     $56, R14, R14   // The result of VCNTMBB is shifted. Unshift it.
+       ADD     R14, R18, R3
+       RET
+
+#else
+tail:  // Count the last 0 - 31 bytes.
+       CMP     R4, $16
+       BLT     tail_8
+       MOVD    (R3), R12
+       MOVD    8(R3), R14
+       CMPB    R12, R5, R12
+       CMPB    R14, R5, R14
+       POPCNTD R12, R12
+       POPCNTD R14, R14
+       ADD     R12, R18, R18
+       ADD     R14, R18, R18
+       ADD     $16, R3, R3
+       ADD     $-16, R4, R4
+
+tail_8:        // Count the remaining 0 - 15 bytes.
+       CMP     R4, $8
+       BLT     tail_4
+       MOVD    (R3), R12
+       CMPB    R12, R5, R12
+       POPCNTD R12, R12
+       ADD     R12, R18, R18
+       ADD     $8, R3, R3
+       ADD     $-8, R4, R4
+
+tail_4:        // Count the remaining 0 - 7 bytes.
+       CMP     R4, $4
+       BLT     tail_2
+       MOVWZ   (R3), R12
+       CMPB    R12, R5, R12
+       SLD     $32, R12, R12   // Remove non-participating matches.
+       POPCNTD R12, R12
+       ADD     R12, R18, R18
+       ADD     $4, R3, R3
+       ADD     $-4, R4, R4
+
+tail_2:        // Count the remaining 0 - 3 bytes.
+       CMP     R4, $2
+       BLT     tail_1
+       MOVHZ   (R3), R12
+       CMPB    R12, R5, R12
+       SLD     $48, R12, R12   // Remove non-participating matches.
+       POPCNTD R12, R12
+       ADD     R12, R18, R18
+       ADD     $2, R3, R3
+       ADD     $-2, R4, R4
+
+tail_1:        // Count the remaining 0 - 1 bytes.
+       CMP     R4, $1
+       BLT     tail_0
+       MOVBZ   (R3), R12
+       CMPB    R12, R5, R12
+       ANDCC   $0x8, R12, R12
+       ADD     R12, R18, R18
+#endif
+
+tail_0:        // No remaining tail to count.
+       SRD     $3, R18, R3     // Fixup count, it is off by 8x.
        RET
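
As a reading aid for the Power10 tail above, here is a rough Go
description of what the LXVLL/VCMPEQUB/VCLRRB/VCNTMBB sequence computes
(a semantic sketch with a hypothetical helper, not part of the change):

package main

import "fmt"

// countTail16 counts occurrences of c in the first n bytes of b (n <= 16),
// mirroring the masked vector tail: load at most 16 valid bytes, compare
// against the splatted needle, drop lanes past the valid length, and count
// the matching lanes.
func countTail16(b []byte, n int, c byte) int {
	var v [16]byte
	copy(v[:], b[:n]) // like LXVLL: only the first n bytes are valid (rest are zero here)

	count := 0
	for i := 0; i < 16; i++ {
		match := v[i] == c // VCMPEQUB: 0xFF on match, 0x00 otherwise
		if i >= n {
			match = false // VCLRRB: ignore results past the valid length (matters e.g. for c == 0)
		}
		if match {
			count++ // VCNTMBB: sum bit 0 of each byte of the compare mask
		}
	}
	return count
}

func main() {
	fmt.Println(countTail16([]byte("mississippi"), 11, 's')) // 4
}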