internal/bytealg: optimize Count/CountString for PPC64/Power10
author     Paul E. Murphy <murp@ibm.com>
           Mon, 17 Jul 2023 20:23:28 +0000 (15:23 -0500)
committer  Paul Murphy <murp@ibm.com>
           Mon, 14 Aug 2023 20:30:44 +0000 (20:30 +0000)
Power10 adds a handful of new instructions (LXVLL, VCLRRB, and
VCNTMBB are used here) which make this noticeably quicker for
smaller values.

Likewise, since the vector loop requires 32B to enter, unroll it
once to count 32B per iteration. This improvement benefits all
PPC64 CPUs.
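
For reference, a minimal, portable Go sketch of the same strategy
(hypothetical helpers countByte and compareBytes, not part of this
change): a 32B-per-iteration main loop, an 8B-at-a-time tail, and a
byte-by-byte cleanup. Matches are tallied as 8 set bits each, like
CMPB+POPCNTD, so the total is divided by 8 at the end.

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math/bits"
)

// countByte counts occurrences of c in b.
func countByte(b []byte, c byte) int {
	pattern := uint64(c) * 0x0101010101010101 // replicate c into every byte lane
	bitTotal := 0

	for len(b) >= 32 { // main loop: 32 bytes per iteration
		for i := 0; i < 32; i += 8 {
			w := binary.LittleEndian.Uint64(b[i:])
			bitTotal += bits.OnesCount64(compareBytes(w, pattern))
		}
		b = b[32:]
	}
	for len(b) >= 8 { // word-sized tail
		w := binary.LittleEndian.Uint64(b)
		bitTotal += bits.OnesCount64(compareBytes(w, pattern))
		b = b[8:]
	}
	count := bitTotal >> 3 // each matching byte contributed 8 bits
	for _, x := range b {  // last 0-7 bytes
		if x == c {
			count++
		}
	}
	return count
}

// compareBytes returns 0xFF in every byte lane where a and b are equal
// and 0x00 elsewhere, the same result CMPB produces.
func compareBytes(a, b uint64) uint64 {
	x := a ^ b // matching lanes become zero
	y := (x & 0x7f7f7f7f7f7f7f7f) + 0x7f7f7f7f7f7f7f7f
	y = ^(y | x | 0x7f7f7f7f7f7f7f7f) // 0x80 in each zero lane, exactly
	return (y >> 7) * 0xff            // widen the per-lane bit to a full 0xFF
}

func main() {
	s := []byte("abracadabra, a banana")
	fmt.Println(countByte(s, 'a'), bytes.Count(s, []byte{'a'})) // 9 9
}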

On Power10, comparing the old and new implementations, both built
with GOPPC64=power8:

name             old time/op   new time/op   delta
CountSingle/10     8.99ns ± 0%    5.55ns ± 3%   -38.24%
CountSingle/16     7.55ns ± 0%    5.56ns ± 3%   -26.37%
CountSingle/17     7.45ns ± 0%    5.25ns ± 0%   -29.52%
CountSingle/31     18.4ns ± 0%     6.2ns ± 0%   -66.41%
CountSingle/32     6.17ns ± 0%    5.04ns ± 0%   -18.37%
CountSingle/33     7.13ns ± 0%    5.99ns ± 0%   -15.94%
CountSingle/4K      198ns ± 0%     115ns ± 0%   -42.08%
CountSingle/4M      190µs ± 0%     109µs ± 0%   -42.49%
CountSingle/64M    3.28ms ± 0%    2.08ms ± 0%   -36.53%

Furthermore, comparing the new tail implementation built with
GOPPC64=power8 against the same code built with GOPPC64=power10:

name             old time/op   new time/op   delta
CountSingle/10     5.55ns ± 3%    4.52ns ± 1%  -18.66%
CountSingle/16     5.56ns ± 3%    4.80ns ± 0%  -13.65%
CountSingle/17     5.25ns ± 0%    4.79ns ± 0%   -8.78%
CountSingle/31     6.17ns ± 0%    4.82ns ± 0%  -21.79%
CountSingle/32     5.04ns ± 0%    5.09ns ± 6%   +1.01%
CountSingle/33     5.99ns ± 0%    5.42ns ± 2%   -9.54%
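
For context, numbers like the above can be produced with a benchmark
along these lines (an illustrative stand-in, not necessarily the exact
benchmark used; it relies only on the public bytes API, which routes
single-byte counts to internal/bytealg.Count):

package countbench // in a *_test.go file (illustrative)

import (
	"bytes"
	"fmt"
	"testing"
)

var sink int // keep the result live so the call is not optimized away

// BenchmarkCountSingle measures single-byte counting, which reaches
// internal/bytealg.Count via bytes.Count when the separator is one byte.
func BenchmarkCountSingle(b *testing.B) {
	for _, n := range []int{10, 16, 17, 31, 32, 33, 4 << 10, 4 << 20, 64 << 20} {
		buf := bytes.Repeat([]byte{'x'}, n)
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(len(buf)))
			for i := 0; i < b.N; i++ {
				sink = bytes.Count(buf, []byte{'z'})
			}
		})
	}
}

Building and running the test binary once with GOPPC64=power8 and once
with GOPPC64=power10, then comparing the outputs (e.g. with benchstat),
gives tables like the second one above.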

Change-Id: I62d80be3b5d706e1abbb4bec7d6278a939a5eed4
Reviewed-on: https://go-review.googlesource.com/c/go/+/512695
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>

src/internal/bytealg/count_ppc64x.s

index 2d2490b0246a4edd4777f9b56484903e9ebf8165..55e02ce8a187ec99364e9a08abd0b9ae70caa15b 100644 (file)
 #include "textflag.h"
 
 TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
-       // R3 = byte array pointer 
+       // R3 = byte array pointer
        // R4 = length
-       MOVBZ R6, R5              // R5 = byte
-       BR    countbytebody<>(SB)
+       // R6 = byte to count
+       MTVRD   R6, V1          // move compare byte
+       MOVD    R6, R5
+       VSPLTB  $7, V1, V1      // replicate byte across V1
+       BR      countbytebody<>(SB)
 
 TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
        // R3 = byte array pointer
        // R4 = length
-       MOVBZ R5, R5              // R5 = byte
-       BR    countbytebody<>(SB)
+       // R5 = byte to count
+       MTVRD   R5, V1          // move compare byte
+       VSPLTB  $7, V1, V1      // replicate byte across V1
+       BR      countbytebody<>(SB)
 
 // R3: addr of string
 // R4: len of string
 // R5: byte to count
+// V1: byte to count, splatted.
 // On exit:
 // R3: return value
-// endianness shouldn't matter since we are just counting and order
-// is irrelevant
 TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
-       DCBT (R3)    // Prepare cache line.
-       MOVD R0, R18 // byte count
-       MOVD R3, R19 // Save base address for calculating the index later.
-       MOVD R4, R16
-
-       MOVD   R5, R6
-       RLDIMI $8, R6, $48, R6
-       RLDIMI $16, R6, $32, R6
-       RLDIMI $32, R6, $0, R6  // fill reg with the byte to count
-
-       VSPLTISW $3, V4     // used for shift
-       MTVRD    R6, V1     // move compare byte
-       VSPLTB   $7, V1, V1 // replicate byte across V1
-
-       CMPU   R4, $32          // Check if it's a small string (<32 bytes)
-       BLT    tail             // Jump to the small string case
-       XXLXOR VS37, VS37, VS37 // clear V5 (aka VS37) to use as accumulator
-
+       MOVD    $0, R18 // byte count
+
+#ifndef GOPPC64_power10
+       RLDIMI  $8, R5, $48, R5
+       RLDIMI  $16, R5, $32, R5
+       RLDIMI  $32, R5, $0, R5 // fill reg with the byte to count
+#endif
+
+       CMPU    R4, $32         // Check if it's a small string (<32 bytes)
+       BLT     tail            // Jump to the small string case
+       SRD     $5, R4, R20
+       MOVD    R20, CTR
+       MOVD    $16, R21
+       XXLXOR  V4, V4, V4
+       XXLXOR  V5, V5, V5
+
+       PCALIGN $16
 cmploop:
-       LXVW4X (R3), VS32 // load bytes from string
-
-       // when the bytes match, the corresponding byte contains all 1s
-       VCMPEQUB V1, V0, V2     // compare bytes
-       VPOPCNTD V2, V3         // each double word contains its count
-       VADDUDM  V3, V5, V5     // accumulate bit count in each double word
-       ADD      $16, R3, R3    // increment pointer
-       SUB      $16, R16, R16  // remaining bytes
-       CMP      R16, $16       // at least 16 remaining?
-       BGE      cmploop
-       VSRD     V5, V4, V5     // shift by 3 to convert bits to bytes
-       VSLDOI   $8, V5, V5, V6 // get the double word values from vector
-       MFVSRD   V5, R9
-       MFVSRD   V6, R10
-       ADD      R9, R10, R9
-       ADD      R9, R18, R18
-
-tail:
-       CMP R16, $8 // 8 bytes left?
-       BLT small
-
-       MOVD    (R3), R12     // load 8 bytes
-       CMPB    R12, R6, R17  // compare bytes
-       POPCNTD R17, R15      // bit count
-       SRD     $3, R15, R15  // byte count
-       ADD     R15, R18, R18 // add to byte count
-
-next1:
-       ADD $8, R3, R3
-       SUB $8, R16, R16 // remaining bytes
-       BR  tail
-
-small:
-       CMP   $0, R16   // any remaining
-       BEQ   done
-       MOVBZ (R3), R12 // check each remaining byte
-       CMP   R12, R5
-       BNE   next2
-       ADD   $1, R18
-
-next2:
-       SUB $1, R16
-       ADD $1, R3  // inc address
-       BR  small
-
-done:
-       MOVD R18, R3    // return count
+       LXVD2X  (R0)(R3), V0    // Count 32B per loop with two vector accumulators.
+       LXVD2X  (R21)(R3), V2
+       VCMPEQUB V2, V1, V2
+       VCMPEQUB V0, V1, V0
+       VPOPCNTD V2, V2         // A match is 0xFF or 0. Count the bits into doubleword buckets.
+       VPOPCNTD V0, V0
+       VADDUDM V0, V4, V4      // Accumulate the popcounts. They are 8x the count.
+       VADDUDM V2, V5, V5      // The count will be fixed up afterwards.
+       ADD     $32, R3
+       BDNZ    cmploop
+
+       VADDUDM V4, V5, V5
+       MFVSRD  V5, R18
+       VSLDOI  $8, V5, V5, V5
+       MFVSRD  V5, R21
+       ADD     R21, R18, R18
+       ANDCC   $31, R4, R4
+       // Skip the tail processing if no bytes remaining.
+       BEQ     tail_0
+
+#ifdef GOPPC64_power10
+       SRD     $3, R18, R18    // Fix the vector loop count before counting the tail on P10.
+
+tail:  // Count the last 0 - 31 bytes.
+       CMP     R4, $16
+       BLE     small_tail_p10
+       LXV     0(R3), V0
+       VCMPEQUB V0, V1, V0
+       VCNTMBB V0, $1, R14     // Sum the value of bit 0 of each byte of the compare into R14.
+       SRD     $56, R14, R14   // The result of VCNTMBB is shifted. Unshift it.
+       ADD     R14, R18, R18
+       ADD     $16, R3, R3
+       ANDCC   $15, R4, R4
+
+small_tail_p10:
+       SLD     $56, R4, R6
+       LXVLL   R3, R6, V0
+       VCMPEQUB V0, V1, V0
+       VCLRRB  V0, R4, V0      // If <16B being compared, clear matches of the 16-R4 bytes.
+       VCNTMBB V0, $1, R14     // Sum the value of bit 0 of each byte of the compare into R14.
+       SRD     $56, R14, R14   // The result of VCNTMBB is shifted. Unshift it.
+       ADD     R14, R18, R3
+       RET
+
+#else
+tail:  // Count the last 0 - 31 bytes.
+       CMP     R4, $16
+       BLT     tail_8
+       MOVD    (R3), R12
+       MOVD    8(R3), R14
+       CMPB    R12, R5, R12
+       CMPB    R14, R5, R14
+       POPCNTD R12, R12
+       POPCNTD R14, R14
+       ADD     R12, R18, R18
+       ADD     R14, R18, R18
+       ADD     $16, R3, R3
+       ADD     $-16, R4, R4
+
+tail_8:        // Count the remaining 0 - 15 bytes.
+       CMP     R4, $8
+       BLT     tail_4
+       MOVD    (R3), R12
+       CMPB    R12, R5, R12
+       POPCNTD R12, R12
+       ADD     R12, R18, R18
+       ADD     $8, R3, R3
+       ADD     $-8, R4, R4
+
+tail_4:        // Count the remaining 0 - 7 bytes.
+       CMP     R4, $4
+       BLT     tail_2
+       MOVWZ   (R3), R12
+       CMPB    R12, R5, R12
+       SLD     $32, R12, R12   // Remove non-participating matches.
+       POPCNTD R12, R12
+       ADD     R12, R18, R18
+       ADD     $4, R3, R3
+       ADD     $-4, R4, R4
+
+tail_2:        // Count the remaining 0 - 3 bytes.
+       CMP     R4, $2
+       BLT     tail_1
+       MOVHZ   (R3), R12
+       CMPB    R12, R5, R12
+       SLD     $48, R12, R12   // Remove non-participating matches.
+       POPCNTD R12, R12
+       ADD     R12, R18, R18
+       ADD     $2, R3, R3
+       ADD     $-2, R4, R4
+
+tail_1:        // Count the remaining 0 - 1 bytes.
+       CMP     R4, $1
+       BLT     tail_0
+       MOVBZ   (R3), R12
+       CMPB    R12, R5, R12
+       ANDCC   $0x8, R12, R12
+       ADD     R12, R18, R18
+#endif
+
+tail_0:        // No remaining tail to count.
+       SRD     $3, R18, R3     // Fixup count, it is off by 8x.
        RET
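
As a reading aid for the Power10 tail above, here is a rough Go
description of what the LXVLL/VCMPEQUB/VCLRRB/VCNTMBB sequence computes
(a semantic sketch with a hypothetical helper, not part of the change):

package main

import "fmt"

// countTail16 counts occurrences of c in the first n bytes of b (n <= 16),
// mirroring the masked vector tail: load at most 16 valid bytes, compare
// against the splatted needle, drop lanes past the valid length, and count
// the matching lanes.
func countTail16(b []byte, n int, c byte) int {
	var v [16]byte
	copy(v[:], b[:n]) // like LXVLL: only the first n bytes are valid (rest are zero here)

	count := 0
	for i := 0; i < 16; i++ {
		match := v[i] == c // VCMPEQUB: 0xFF on match, 0x00 otherwise
		if i >= n {
			match = false // VCLRRB: ignore results past the valid length (matters e.g. for c == 0)
		}
		if match {
			count++ // VCNTMBB: sum bit 0 of each byte of the compare mask
		}
	}
	return count
}

func main() {
	fmt.Println(countTail16([]byte("mississippi"), 11, 's')) // 4
}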