]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: rewrite indexbytebody on PPC64
authorPaul E. Murphy <murp@ibm.com>
Mon, 6 Mar 2023 22:51:31 +0000 (16:51 -0600)
committerPaul Murphy <murp@ibm.com>
Fri, 21 Apr 2023 16:10:29 +0000 (16:10 +0000)
Use P8 instructions throughout to be backwards compatible, but
otherwise not impede performance. Use overlapping loads where
possible, and prioritize larger checks over smaller check.

However, some newer instructions can be used surgically when
targeting a newer GOPPC64. These can lead to noticeable
performance improvements with minimal impact to readability.

All tests run below on a Power10/ppc64le, and use a small
modification to BenchmarkIndexByte to ensure the IndexByte
wrapper call is inlined (as it likely is under realistic usage).
This wrapper adds substantial overhead if not inlined.

Previous (power9 path, GOPPC64=power8) vs. GOPPC64=power8:

IndexByte/1       3.81ns ± 8%     3.11ns ± 5%  -18.39%
IndexByte/2       3.82ns ± 3%     3.20ns ± 6%  -16.23%
IndexByte/3       3.61ns ± 4%     3.25ns ± 6%  -10.13%
IndexByte/4       3.66ns ± 5%     3.08ns ± 1%  -15.91%
IndexByte/5       3.82ns ± 0%     3.75ns ± 2%   -1.94%
IndexByte/6       3.83ns ± 0%     3.87ns ± 4%   +1.04%
IndexByte/7       3.83ns ± 0%     3.82ns ± 0%   -0.27%
IndexByte/8       3.82ns ± 0%     2.92ns ±11%  -23.70%
IndexByte/9       3.70ns ± 2%     3.08ns ± 2%  -16.87%
IndexByte/10      3.74ns ± 2%     3.04ns ± 0%  -18.75%
IndexByte/11      3.75ns ± 0%     3.31ns ± 8%  -11.79%
IndexByte/12      3.74ns ± 0%     3.04ns ± 0%  -18.86%
IndexByte/13      3.83ns ± 4%     3.04ns ± 0%  -20.64%
IndexByte/14      3.80ns ± 1%     3.30ns ± 8%  -13.18%
IndexByte/15      3.77ns ± 1%     3.04ns ± 0%  -19.33%
IndexByte/16      3.81ns ± 0%     2.78ns ± 7%  -26.88%
IndexByte/17      4.12ns ± 0%     3.04ns ± 1%  -26.11%
IndexByte/18      4.27ns ± 6%     3.05ns ± 0%  -28.64%
IndexByte/19      4.30ns ± 4%     3.02ns ± 2%  -29.65%
IndexByte/20      4.43ns ± 7%     3.45ns ± 7%  -22.15%
IndexByte/21      4.12ns ± 0%     3.03ns ± 1%  -26.35%
IndexByte/22      4.40ns ± 6%     3.05ns ± 0%  -30.82%
IndexByte/23      4.40ns ± 6%     3.01ns ± 2%  -31.48%
IndexByte/24      4.32ns ± 5%     3.07ns ± 0%  -28.98%
IndexByte/25      4.76ns ± 2%     3.04ns ± 1%  -36.11%
IndexByte/26      4.82ns ± 0%     3.05ns ± 0%  -36.66%
IndexByte/27      4.82ns ± 0%     2.97ns ± 3%  -38.39%
IndexByte/28      4.82ns ± 0%     2.96ns ± 3%  -38.57%
IndexByte/29      4.82ns ± 0%     3.34ns ± 9%  -30.71%
IndexByte/30      4.82ns ± 0%     3.05ns ± 0%  -36.77%
IndexByte/31      4.81ns ± 0%     3.05ns ± 0%  -36.70%
IndexByte/32      3.52ns ± 0%     3.44ns ± 1%   -2.15%
IndexByte/33      4.77ns ± 1%     3.35ns ± 0%  -29.81%
IndexByte/34      5.01ns ± 5%     3.35ns ± 0%  -33.15%
IndexByte/35      4.92ns ± 9%     3.35ns ± 0%  -31.89%
IndexByte/36      4.81ns ± 5%     3.35ns ± 0%  -30.37%
IndexByte/37      4.99ns ± 6%     3.35ns ± 0%  -32.86%
IndexByte/38      5.06ns ± 5%     3.35ns ± 0%  -33.84%
IndexByte/39      5.02ns ± 5%     3.48ns ± 9%  -30.58%
IndexByte/40      5.21ns ± 9%     3.55ns ± 4%  -31.82%
IndexByte/41      5.18ns ± 0%     3.42ns ± 2%  -33.98%
IndexByte/42      5.19ns ± 0%     3.55ns ±11%  -31.56%
IndexByte/43      5.18ns ± 0%     3.45ns ± 5%  -33.46%
IndexByte/44      5.18ns ± 0%     3.39ns ± 0%  -34.56%
IndexByte/45      5.18ns ± 0%     3.43ns ± 4%  -33.74%
IndexByte/46      5.18ns ± 0%     3.47ns ± 1%  -33.03%
IndexByte/47      5.18ns ± 0%     3.44ns ± 2%  -33.54%
IndexByte/48      5.18ns ± 0%     3.39ns ± 0%  -34.52%
IndexByte/49      5.69ns ± 0%     3.79ns ± 0%  -33.45%
IndexByte/50      5.70ns ± 0%     3.70ns ± 3%  -34.98%
IndexByte/51      5.70ns ± 0%     3.70ns ± 2%  -35.05%
IndexByte/52      5.69ns ± 0%     3.80ns ± 1%  -33.35%
IndexByte/53      5.69ns ± 0%     3.78ns ± 0%  -33.54%
IndexByte/54      5.69ns ± 0%     3.78ns ± 1%  -33.51%
IndexByte/55      5.69ns ± 0%     3.78ns ± 0%  -33.61%
IndexByte/56      5.69ns ± 0%     3.81ns ± 3%  -33.12%
IndexByte/57      6.20ns ± 0%     3.79ns ± 4%  -38.89%
IndexByte/58      6.20ns ± 0%     3.74ns ± 2%  -39.58%
IndexByte/59      6.20ns ± 0%     3.69ns ± 2%  -40.47%
IndexByte/60      6.20ns ± 0%     3.79ns ± 1%  -38.81%
IndexByte/61      6.20ns ± 0%     3.77ns ± 1%  -39.23%
IndexByte/62      6.20ns ± 0%     3.79ns ± 0%  -38.89%
IndexByte/63      6.20ns ± 0%     3.79ns ± 0%  -38.90%
IndexByte/64      4.17ns ± 0%     3.47ns ± 3%  -16.70%
IndexByte/65      5.38ns ± 0%     4.21ns ± 0%  -21.59%
IndexByte/66      5.38ns ± 0%     4.21ns ± 0%  -21.58%
IndexByte/67      5.38ns ± 0%     4.22ns ± 0%  -21.58%
IndexByte/68      5.38ns ± 0%     4.22ns ± 0%  -21.59%
IndexByte/69      5.38ns ± 0%     4.22ns ± 0%  -21.56%
IndexByte/70      5.38ns ± 0%     4.21ns ± 0%  -21.59%
IndexByte/71      5.37ns ± 0%     4.21ns ± 0%  -21.51%
IndexByte/72      5.37ns ± 0%     4.22ns ± 0%  -21.46%
IndexByte/73      5.71ns ± 0%     4.22ns ± 0%  -26.20%
IndexByte/74      5.71ns ± 0%     4.21ns ± 0%  -26.21%
IndexByte/75      5.71ns ± 0%     4.21ns ± 0%  -26.17%
IndexByte/76      5.71ns ± 0%     4.22ns ± 0%  -26.22%
IndexByte/77      5.71ns ± 0%     4.22ns ± 0%  -26.22%
IndexByte/78      5.71ns ± 0%     4.21ns ± 0%  -26.22%
IndexByte/79      5.71ns ± 0%     4.22ns ± 0%  -26.21%
IndexByte/80      5.71ns ± 0%     4.21ns ± 0%  -26.19%
IndexByte/81      6.20ns ± 0%     4.39ns ± 0%  -29.13%
IndexByte/82      6.20ns ± 0%     4.36ns ± 0%  -29.67%
IndexByte/83      6.20ns ± 0%     4.36ns ± 0%  -29.63%
IndexByte/84      6.20ns ± 0%     4.39ns ± 0%  -29.21%
IndexByte/85      6.20ns ± 0%     4.36ns ± 0%  -29.64%
IndexByte/86      6.20ns ± 0%     4.36ns ± 0%  -29.63%
IndexByte/87      6.20ns ± 0%     4.39ns ± 0%  -29.21%
IndexByte/88      6.20ns ± 0%     4.36ns ± 0%  -29.65%
IndexByte/89      6.74ns ± 0%     4.36ns ± 0%  -35.33%
IndexByte/90      6.75ns ± 0%     4.37ns ± 0%  -35.22%
IndexByte/91      6.74ns ± 0%     4.36ns ± 0%  -35.30%
IndexByte/92      6.74ns ± 0%     4.36ns ± 0%  -35.34%
IndexByte/93      6.74ns ± 0%     4.37ns ± 0%  -35.20%
IndexByte/94      6.74ns ± 0%     4.36ns ± 0%  -35.33%
IndexByte/95      6.75ns ± 0%     4.36ns ± 0%  -35.32%
IndexByte/96      4.83ns ± 0%     4.34ns ± 2%  -10.24%
IndexByte/97      5.91ns ± 0%     4.65ns ± 0%  -21.24%
IndexByte/98      5.91ns ± 0%     4.65ns ± 0%  -21.24%
IndexByte/99      5.91ns ± 0%     4.65ns ± 0%  -21.23%
IndexByte/100     5.90ns ± 0%     4.65ns ± 0%  -21.21%
IndexByte/101     5.90ns ± 0%     4.65ns ± 0%  -21.22%
IndexByte/102     5.90ns ± 0%     4.65ns ± 0%  -21.23%
IndexByte/103     5.91ns ± 0%     4.65ns ± 0%  -21.23%
IndexByte/104     5.91ns ± 0%     4.65ns ± 0%  -21.24%
IndexByte/105     6.25ns ± 0%     4.65ns ± 0%  -25.59%
IndexByte/106     6.25ns ± 0%     4.65ns ± 0%  -25.59%
IndexByte/107     6.25ns ± 0%     4.65ns ± 0%  -25.60%
IndexByte/108     6.25ns ± 0%     4.65ns ± 0%  -25.58%
IndexByte/109     6.24ns ± 0%     4.65ns ± 0%  -25.50%
IndexByte/110     6.25ns ± 0%     4.65ns ± 0%  -25.56%
IndexByte/111     6.25ns ± 0%     4.65ns ± 0%  -25.60%
IndexByte/112     6.25ns ± 0%     4.65ns ± 0%  -25.59%
IndexByte/113     6.76ns ± 0%     5.05ns ± 0%  -25.37%
IndexByte/114     6.76ns ± 0%     5.05ns ± 0%  -25.31%
IndexByte/115     6.76ns ± 0%     5.05ns ± 0%  -25.38%
IndexByte/116     6.76ns ± 0%     5.05ns ± 0%  -25.31%
IndexByte/117     6.76ns ± 0%     5.05ns ± 0%  -25.38%
IndexByte/118     6.76ns ± 0%     5.05ns ± 0%  -25.31%
IndexByte/119     6.76ns ± 0%     5.05ns ± 0%  -25.38%
IndexByte/120     6.76ns ± 0%     5.05ns ± 0%  -25.36%
IndexByte/121     7.35ns ± 0%     5.05ns ± 0%  -31.33%
IndexByte/122     7.36ns ± 0%     5.05ns ± 0%  -31.42%
IndexByte/123     7.38ns ± 0%     5.05ns ± 0%  -31.60%
IndexByte/124     7.38ns ± 0%     5.05ns ± 0%  -31.59%
IndexByte/125     7.38ns ± 0%     5.05ns ± 0%  -31.60%
IndexByte/126     7.38ns ± 0%     5.05ns ± 0%  -31.58%
IndexByte/128     5.28ns ± 0%     5.10ns ± 0%   -3.41%
IndexByte/256     7.27ns ± 0%     7.28ns ± 2%   +0.13%
IndexByte/512     12.1ns ± 0%     11.8ns ± 0%   -2.51%
IndexByte/1K      23.1ns ± 3%     22.0ns ± 0%   -4.66%
IndexByte/2K      42.6ns ± 0%     42.4ns ± 0%   -0.41%
IndexByte/4K      90.3ns ± 0%     89.4ns ± 0%   -0.98%
IndexByte/8K       170ns ± 0%      170ns ± 0%   -0.59%
IndexByte/16K      331ns ± 0%      330ns ± 0%   -0.27%
IndexByte/32K      660ns ± 0%      660ns ± 0%   -0.08%
IndexByte/64K     1.30µs ± 0%     1.30µs ± 0%   -0.08%
IndexByte/128K    2.58µs ± 0%     2.58µs ± 0%   -0.04%
IndexByte/256K    5.15µs ± 0%     5.15µs ± 0%   -0.04%
IndexByte/512K    10.3µs ± 0%     10.3µs ± 0%   -0.03%
IndexByte/1M      20.6µs ± 0%     20.5µs ± 0%   -0.03%
IndexByte/2M      41.1µs ± 0%     41.1µs ± 0%   -0.03%
IndexByte/4M      82.2µs ± 0%     82.1µs ± 0%   -0.02%
IndexByte/8M       164µs ± 0%      164µs ± 0%   -0.01%
IndexByte/16M      328µs ± 0%      328µs ± 0%   -0.01%
IndexByte/32M      657µs ± 0%      657µs ± 0%   -0.00%

GOPPC64=power8 vs GOPPC64=power9. The Improvement is
most noticed between 16 and 64B, and goes away around
128B.

IndexByte/16      2.78ns ± 7%     2.65ns ±15%   -4.74%
IndexByte/17      3.04ns ± 1%     2.80ns ± 3%   -7.85%
IndexByte/18      3.05ns ± 0%     2.71ns ± 4%  -11.00%
IndexByte/19      3.02ns ± 2%     2.76ns ±10%   -8.74%
IndexByte/20      3.45ns ± 7%     2.91ns ± 0%  -15.46%
IndexByte/21      3.03ns ± 1%     2.84ns ± 9%   -6.33%
IndexByte/22      3.05ns ± 0%     2.67ns ± 1%  -12.38%
IndexByte/23      3.01ns ± 2%     2.67ns ± 1%  -11.24%
IndexByte/24      3.07ns ± 0%     2.92ns ±12%   -4.79%
IndexByte/25      3.04ns ± 1%     3.15ns ±15%   +3.63%
IndexByte/26      3.05ns ± 0%     2.83ns ±13%   -7.33%
IndexByte/27      2.97ns ± 3%     2.98ns ±10%   +0.56%
IndexByte/28      2.96ns ± 3%     2.96ns ± 9%   -0.05%
IndexByte/29      3.34ns ± 9%     3.03ns ±12%   -9.33%
IndexByte/30      3.05ns ± 0%     2.68ns ± 1%  -12.05%
IndexByte/31      3.05ns ± 0%     2.83ns ±12%   -7.27%
IndexByte/32      3.44ns ± 1%     3.21ns ±10%   -6.78%
IndexByte/33      3.35ns ± 0%     3.41ns ± 2%   +1.95%
IndexByte/34      3.35ns ± 0%     3.13ns ± 0%   -6.53%
IndexByte/35      3.35ns ± 0%     3.13ns ± 0%   -6.54%
IndexByte/36      3.35ns ± 0%     3.13ns ± 0%   -6.52%
IndexByte/37      3.35ns ± 0%     3.13ns ± 0%   -6.52%
IndexByte/38      3.35ns ± 0%     3.24ns ± 4%   -3.30%
IndexByte/39      3.48ns ± 9%     3.44ns ± 2%   -1.19%
IndexByte/40      3.55ns ± 4%     3.46ns ± 2%   -2.44%
IndexByte/41      3.42ns ± 2%     3.39ns ± 4%   -0.86%
IndexByte/42      3.55ns ±11%     3.46ns ± 1%   -2.65%
IndexByte/43      3.45ns ± 5%     3.44ns ± 2%   -0.31%
IndexByte/44      3.39ns ± 0%     3.43ns ± 3%   +1.23%
IndexByte/45      3.43ns ± 4%     3.50ns ± 1%   +2.07%
IndexByte/46      3.47ns ± 1%     3.46ns ± 2%   -0.31%
IndexByte/47      3.44ns ± 2%     3.47ns ± 1%   +0.78%
IndexByte/48      3.39ns ± 0%     3.46ns ± 2%   +1.96%
IndexByte/49      3.79ns ± 0%     3.47ns ± 0%   -8.41%
IndexByte/50      3.70ns ± 3%     3.64ns ± 5%   -1.66%
IndexByte/51      3.70ns ± 2%     3.75ns ± 0%   +1.40%
IndexByte/52      3.80ns ± 1%     3.77ns ± 0%   -0.70%
IndexByte/53      3.78ns ± 0%     3.77ns ± 0%   -0.46%
IndexByte/54      3.78ns ± 1%     3.53ns ± 7%   -6.74%
IndexByte/55      3.78ns ± 0%     3.47ns ± 0%   -8.17%
IndexByte/56      3.81ns ± 3%     3.45ns ± 0%   -9.43%
IndexByte/57      3.79ns ± 4%     3.47ns ± 0%   -8.45%
IndexByte/58      3.74ns ± 2%     3.55ns ± 4%   -5.16%
IndexByte/59      3.69ns ± 2%     3.61ns ± 4%   -2.01%
IndexByte/60      3.79ns ± 1%     3.45ns ± 0%   -9.09%
IndexByte/61      3.77ns ± 1%     3.47ns ± 0%   -7.93%
IndexByte/62      3.79ns ± 0%     3.45ns ± 0%   -8.97%
IndexByte/63      3.79ns ± 0%     3.47ns ± 0%   -8.44%
IndexByte/64      3.47ns ± 3%     3.18ns ± 0%   -8.41%

GOPPC64=power9 vs GOPPC64=power10. Only sizes <16 will
show meaningful changes.

IndexByte/1       3.27ns ± 8%     2.36ns ± 2%  -27.58%
IndexByte/2       3.06ns ± 4%     2.34ns ± 1%  -23.42%
IndexByte/3       3.77ns ±11%     2.48ns ± 7%  -34.03%
IndexByte/4       3.18ns ± 8%     2.33ns ± 1%  -26.69%
IndexByte/5       3.18ns ± 5%     2.34ns ± 4%  -26.26%
IndexByte/6       3.13ns ± 3%     2.35ns ± 1%  -24.97%
IndexByte/7       3.25ns ± 1%     2.33ns ± 1%  -28.22%
IndexByte/8       2.79ns ± 2%     2.36ns ± 1%  -15.32%
IndexByte/9       2.90ns ± 0%     2.34ns ± 2%  -19.36%
IndexByte/10      2.99ns ± 3%     2.31ns ± 1%  -22.70%
IndexByte/11      3.13ns ± 7%     2.31ns ± 0%  -26.08%
IndexByte/12      3.01ns ± 4%     2.32ns ± 1%  -22.91%
IndexByte/13      2.98ns ± 3%     2.31ns ± 1%  -22.72%
IndexByte/14      2.92ns ± 2%     2.61ns ±16%  -10.58%
IndexByte/15      3.02ns ± 5%     2.69ns ± 7%  -10.90%
IndexByte/16      2.65ns ±15%     2.29ns ± 1%  -13.61%

Change-Id: I4482f762d25eabf60def4981a0b2bc0c10ccf50c
Reviewed-on: https://go-review.googlesource.com/c/go/+/478656
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Bryan Mills <bcmills@google.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Archana Ravindar <aravind5@in.ibm.com>
TryBot-Result: Gopher Robot <gobot@golang.org>

src/internal/bytealg/indexbyte_ppc64x.s

index 1a6e852d67258f107270cbce9bbb0c7bff4e841f..b6714f45aae3cab49b0caecb1723a0d72377ce6e 100644 (file)
@@ -11,381 +11,304 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
        // R3 = byte array pointer
        // R4 = length
        MOVD    R6, R5          // R5 = byte
-       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
        BR      indexbytebody<>(SB)
 
 TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
        // R3 = string
        // R4 = length
        // R5 = byte
-       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
        BR      indexbytebody<>(SB)
 
+#ifndef GOPPC64_power9
+#ifdef GOARCH_ppc64le
+DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
+DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
+#else
+DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
+DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
+#endif
+GLOBL indexbytevbperm<>+0(SB), RODATA, $16
+#endif
+
+// Some operations are endian specific, choose the correct opcode base on GOARCH.
+// Note, _VCZBEBB is only available on power9 and newer.
+#ifdef GOARCH_ppc64le
+#define _LDBEX MOVDBR
+#define _LWBEX MOVWBR
+#define _LHBEX MOVHBR
+#define _VCZBEBB VCTZLSBB
+#else
+#define _LDBEX MOVD
+#define _LWBEX MOVW
+#define _LHBEX MOVH
+#define _VCZBEBB VCLZLSBB
+#endif
+
 // R3 = addr of string
 // R4 = len of string
 // R5 = byte to find
-// R16 = 1 if running on a POWER9 system, 0 otherwise
 // On exit:
 // R3 = return value
 TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
-       MOVD    R3,R17          // Save base address for calculating the index later.
-       RLDICR  $0,R3,$60,R8    // Align address to doubleword boundary in R8.
-       RLDIMI  $8,R5,$48,R5    // Replicating the byte across the register.
-       ADD     R4,R3,R7        // Last acceptable address in R7.
+       CMPU    R4,$32
 
-       RLDIMI  $16,R5,$32,R5
-       CMPU    R4,$32          // Check if it's a small string (≤32 bytes). Those will be processed differently.
-       MOVD    $-1,R9
-       RLWNM   $3,R3,$26,$28,R6        // shift amount for mask (r3&0x7)*8
-       RLDIMI  $32,R5,$0,R5
-       MOVD    R7,R10          // Save last acceptable address in R10 for later.
-       ADD     $-1,R7,R7
-#ifdef GOARCH_ppc64le
-       SLD     R6,R9,R9        // Prepare mask for Little Endian
-#else
-       SRD     R6,R9,R9        // Same for Big Endian
+#ifndef GOPPC64_power9
+       // Load VBPERMQ constant to reduce compare into an ordered bit mask.
+       MOVD    $indexbytevbperm<>+00(SB),R16
+       LXVD2X  (R16),V0        // Set up swap string
 #endif
-       BLT     small_string    // Jump to the small string case if it's <32 bytes.
-       CMP     R16,$1          // optimize for power8 v power9
-       BNE     power8
-       VSPLTISB        $3,V10  // Use V10 as control for VBPERMQ
+
        MTVRD   R5,V1
-       LVSL    (R0+R0),V11     // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
-       VSLB    V11,V10,V10     // to extract the first bit of match result into GPR
        VSPLTB  $7,V1,V1        // Replicate byte across V1
-       CMP     R4,$64
+
+       BLT     cmp16           // Jump to the small string case if it's <32 bytes.
+
+       CMP     R4,$64,CR1
        MOVD    $16,R11
        MOVD    R3,R8
-       BLT     cmp32
+       BLT     CR1,cmp32       // Special case for length 32 - 63
        MOVD    $32,R12
        MOVD    $48,R6
 
+       RLDICR  $0,R4,$63-6,R9  // R9 = len &^ 63
+       ADD     R3,R9,R9        // R9 = &s[len &^ 63]
+       ANDCC   $63,R4          // (len &= 63) cmp 0.
+
+       PCALIGN $16
 loop64:
-       LXVB16X (R0)(R8),V2     // scan 64 bytes at a time
+       LXVD2X  (R0)(R8),V2     // Scan 64 bytes at a time, starting at &s[0]
        VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat0    // match found at R8, jump out
+       BNE     CR6,foundat0    // Match found at R8, jump out
 
-       LXVB16X (R8)(R11),V2
+       LXVD2X  (R11)(R8),V2
        VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat1    // match found at R8+16 bytes, jump out
+       BNE     CR6,foundat1    // Match found at R8+16 bytes, jump out
 
-       LXVB16X (R8)(R12),V2
+       LXVD2X  (R12)(R8),V2
        VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat2    // match found at R8+32 bytes, jump out
+       BNE     CR6,foundat2    // Match found at R8+32 bytes, jump out
 
-       LXVB16X (R8)(R6),V2
+       LXVD2X  (R6)(R8),V2
        VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat3    // match found at R8+48 bytes, jump out
+       BNE     CR6,foundat3    // Match found at R8+48 bytes, jump out
+
        ADD     $64,R8
-       ADD     $-64,R4
-       CMP     R4,$64          // >=64 bytes left to scan?
-       BGE     loop64
-       CMP     R4,$32
-       BLT     rem             // jump to rem if there are < 32 bytes left
-cmp32:
-       LXVB16X (R0)(R8),V2     // 32-63 bytes left
+       CMPU    R8,R9,CR1
+       BNE     CR1,loop64      // R8 != &s[len &^ 63]?
+
+       PCALIGN $32
+       BEQ     notfound        // Is tail length 0? CR0 is set before entering loop64.
+
+       CMP     R4,$32          // Tail length >= 32, use cmp32 path.
+       CMP     R4,$16,CR1
+       BGE     cmp32
+
+       ADD     R8,R4,R9
+       ADD     $-16,R9
+       BLE     CR1,cmp64_tail_gt0
+
+cmp64_tail_gt16:       // Tail length 17 - 32
+       LXVD2X  (R0)(R8),V2
        VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat0    // match found at R8
+       BNE     CR6,foundat0
 
-       LXVB16X (R11)(R8),V2
+cmp64_tail_gt0:        // Tail length 1 - 16
+       MOVD    R9,R8
+       LXVD2X  (R0)(R9),V2
        VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat1    // match found at R8+16
+       BNE     CR6,foundat0
 
-       ADD     $32,R8
-       ADD     $-32,R4
-rem:
-       RLDICR  $0,R8,$60,R8    // align address to reuse code for tail end processing
-       BR      small_string
+       BR      notfound
+
+cmp32: // Length 32 - 63
+
+       // Bytes 0 - 15
+       LXVD2X  (R0)(R8),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0
+
+       // Bytes 16 - 31
+       LXVD2X  (R8)(R11),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat1            // Match found at R8+16 bytes, jump out
+
+       BEQ     notfound                // Is length <= 32? (CR0 holds this comparison on entry to cmp32)
+       CMP     R4,$48
+
+       ADD     R4,R8,R9                // Compute &s[len(s)-16]
+       ADD     $32,R8,R8
+       ADD     $-16,R9,R9
+       ISEL    CR0GT,R8,R9,R8          // R8 = len(s) <= 48 ? R9 : R8
+
+       // Bytes 33 - 47
+       LXVD2X  (R0)(R8),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0            // match found at R8+32 bytes, jump out
+
+       BLE     notfound
 
+       // Bytes 48 - 63
+       MOVD    R9,R8                   // R9 holds the final check.
+       LXVD2X  (R0)(R9),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0            // Match found at R8+48 bytes, jump out
+
+       BR      notfound
+
+// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
+#ifndef GOPPC64_power9
+#define ADJUST_FOR_CNTLZW -16
+#else
+#define ADJUST_FOR_CNTLZW 0
+#endif
+
+// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
+// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
 foundat3:
-       ADD     $16,R8
+       SUB     R3,R8,R3
+       ADD     $48+ADJUST_FOR_CNTLZW,R3
+       BR      vfound
 foundat2:
-       ADD     $16,R8
+       SUB     R3,R8,R3
+       ADD     $32+ADJUST_FOR_CNTLZW,R3
+       BR      vfound
 foundat1:
-       ADD     $16,R8
+       SUB     R3,R8,R3
+       ADD     $16+ADJUST_FOR_CNTLZW,R3
+       BR      vfound
 foundat0:
-       // Compress the result into a single doubleword and
-       // move it to a GPR for the final calculation.
-       VBPERMQ V6,V10,V6
-       MFVRD   V6,R3
-       // count leading zeroes upto the match that ends up in low 16 bits
-       // in both endian modes, compute index by subtracting the number by 16
-       CNTLZW  R3,R11
-       ADD     $-16,R11
-       ADD     R8,R11,R3       // Calculate byte address
-       SUB     R17,R3
+       SUB     R3,R8,R3
+       ADD     $0+ADJUST_FOR_CNTLZW,R3
+vfound:
+       // Map equal values into a 16 bit value with earlier matches setting higher bits.
+#ifndef GOPPC64_power9
+       VBPERMQ V6,V0,V6
+       MFVRD   V6,R4
+       CNTLZW  R4,R4
+#else
+#ifdef GOARCH_ppc64le
+       // Put the value back into LE ordering by swapping doublewords.
+       XXPERMDI        V6,V6,$2,V6
+#endif
+       _VCZBEBB        V6,R4
+#endif
+       ADD     R3,R4,R3
        RET
-power8:
-       // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
-       // in V0, V1 and V10, then branch to the preloop.
-       ANDCC   $63,R3,R11
-       BEQ     CR0,qw_align
-       RLDICL  $0,R3,$61,R11
-
-       MOVD    0(R8),R12       // Load one doubleword from the aligned address in R8.
-       CMPB    R12,R5,R3       // Check for a match.
-       AND     R9,R3,R3        // Mask bytes below s_base
-       RLDICR  $0,R7,$60,R7    // Last doubleword in R7
-       CMPU    R3,$0,CR7       // If we have a match, jump to the final computation
-       BNE     CR7,done
-       ADD     $8,R8,R8
-       ADD     $-8,R4,R4
-       ADD     R4,R11,R4
 
-       // Check for quadword alignment
-       ANDCC   $15,R8,R11
-       BEQ     CR0,qw_align
+cmp16: // Length 16 - 31
+       CMPU    R4,$16
+       ADD     R4,R3,R9
+       BLT     cmp8
 
-       // Not aligned, so handle the next doubleword
-       MOVD    0(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR7
-       BNE     CR7,done
-       ADD     $8,R8,R8
-       ADD     $-8,R4,R4
+       ADD     $-16,R9,R9              // &s[len(s)-16]
 
-       // Either quadword aligned or 64-byte at this point. We can use LVX.
-qw_align:
-
-       // Set up auxiliary data for the vectorized algorithm.
-       VSPLTISB  $0,V0         // Replicate 0 across V0
-       VSPLTISB  $3,V10        // Use V10 as control for VBPERMQ
-       MTVRD     R5,V1
-       LVSL      (R0+R0),V11
-       VSLB      V11,V10,V10
-       VSPLTB    $7,V1,V1      // Replicate byte across V1
-       CMPU      R4, $64       // If len ≤ 64, don't use the vectorized loop
-       BLE       tail
-
-       // We will load 4 quardwords per iteration in the loop, so check for
-       // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
-       ANDCC     $63,R8,R11
-       BEQ       CR0,preloop
-
-       // Not 64-byte aligned. Load one quadword at a time until aligned.
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       ADD         $-16,R4,R4
-
-       ANDCC       $63,R8,R11
-       BEQ         CR0,preloop
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       ADD         $-16,R4,R4
-
-       ANDCC       $63,R8,R11
-       BEQ         CR0,preloop
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
-       BNE         CR6,found_qw_align
-       ADD         $-16,R4,R4
-       ADD         $16,R8,R8
-
-       // 64-byte aligned. Prepare for the main loop.
-preloop:
-       CMPU    R4,$64
-       BLE     tail          // If len ≤ 64, don't use the vectorized loop
-
-       // We are now aligned to a 64-byte boundary. We will load 4 quadwords
-       // per loop iteration. The last doubleword is in R10, so our loop counter
-       // starts at (R10-R8)/64.
-       SUB     R8,R10,R6
-       SRD     $6,R6,R9      // Loop counter in R9
-       MOVD    R9,CTR
-
-       ADD     $-64,R8,R8   // Adjust index for loop entry
-       MOVD    $16,R11      // Load offsets for the vector loads
-       MOVD    $32,R9
-       MOVD    $48,R7
-
-       // Main loop we will load 64 bytes per iteration
-loop:
-       ADD         $64,R8,R8         // Fuse addi+lvx for performance
-       LVX         (R8+R0),V2        // Load 4 16-byte vectors
-       LVX         (R8+R11),V3
-       VCMPEQUB    V1,V2,V6          // Look for byte in each vector
-       VCMPEQUB    V1,V3,V7
-
-       LVX         (R8+R9),V4
-       LVX         (R8+R7),V5
-       VCMPEQUB    V1,V4,V8
-       VCMPEQUB    V1,V5,V9
-
-       VOR         V6,V7,V11         // Compress the result in a single vector
-       VOR         V8,V9,V12
-       VOR         V11,V12,V13
-       VCMPEQUBCC  V0,V13,V14        // Check for byte
-       BGE         CR6,found
-       BC          16,0,loop         // bdnz loop
-
-       // Handle the tailing bytes or R4 ≤ 64
-       RLDICL  $0,R6,$58,R4
-       ADD     $64,R8,R8
-tail:
-       CMPU        R4,$0
-       BEQ         notfound
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       CMPU        R4,$16,CR6
-       BLE         CR6,notfound
-       ADD         $-16,R4,R4
-
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       CMPU        R4,$16,CR6
-       BLE         CR6,notfound
-       ADD         $-16,R4,R4
-
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       CMPU        R4,$16,CR6
-       BLE         CR6,notfound
-       ADD         $-16,R4,R4
-
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6
-       BNE         CR6,found_qw_align
+       // Bytes 0 - 15
+       LXVD2X  (R0)(R3),V2
+       VCMPEQUBCC      V2,V1,V6
+       MOVD    R3,R8
+       BNE     CR6,foundat0            // Match found at R8+32 bytes, jump out
 
-notfound:
-       MOVD    $-1, R3
-       RET
+       BEQ     notfound
 
-found:
-       // We will now compress the results into a single doubleword,
-       // so it can be moved to a GPR for the final index calculation.
-
-       // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
-       // first bit of each byte into bits 48-63.
-       VBPERMQ   V6,V10,V6
-       VBPERMQ   V7,V10,V7
-       VBPERMQ   V8,V10,V8
-       VBPERMQ   V9,V10,V9
-
-       // Shift each 16-bit component into its correct position for
-       // merging into a single doubleword.
-#ifdef GOARCH_ppc64le
-       VSLDOI    $2,V7,V7,V7
-       VSLDOI    $4,V8,V8,V8
-       VSLDOI    $6,V9,V9,V9
-#else
-       VSLDOI    $6,V6,V6,V6
-       VSLDOI    $4,V7,V7,V7
-       VSLDOI    $2,V8,V8,V8
-#endif
+       // Bytes 16 - 30
+       MOVD    R9,R8                   // R9 holds the final check.
+       LXVD2X  (R0)(R9),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0            // Match found at R8+48 bytes, jump out
+
+       BR      notfound
 
-       // Merge V6-V9 into a single doubleword and move to a GPR.
-       VOR     V6,V7,V11
-       VOR     V8,V9,V4
-       VOR     V4,V11,V4
-       MFVRD   V4,R3
 
-#ifdef GOARCH_ppc64le
-       ADD       $-1,R3,R11
-       ANDN      R3,R11,R11
-       POPCNTD   R11,R11       // Count trailing zeros (Little Endian).
+cmp8:  // Length 8 - 15
+#ifdef GOPPC64_power10
+       // Load all the bytes into a single VSR in BE order.
+       SLD     $56,R4,R5
+       LXVLL   R3,R5,V2
+       // Compare and count the number which don't match.
+       VCMPEQUB        V2,V1,V6
+       VCLZLSBB        V6,R3
+       // If count is the number of bytes, or more. No matches are found.
+       CMPU    R3,R4
+       MOVD    $-1,R5
+       // Otherwise, the count is the index of the first match.
+       ISEL    CR0LT,R3,R5,R3
+       RET
 #else
-       CNTLZD  R3,R11          // Count leading zeros (Big Endian).
-#endif
-       ADD     R8,R11,R3       // Calculate byte address
+       RLDIMI  $8,R5,$48,R5    // Replicating the byte across the register.
+       RLDIMI  $16,R5,$32,R5
+       RLDIMI  $32,R5,$0,R5
+       CMPU    R4,$8
+       BLT     cmp4
+       MOVD    $-8,R11
+       ADD     $-8,R4,R4
 
-return:
-       SUB     R17, R3
+       _LDBEX  (R0)(R3),R10
+       _LDBEX  (R11)(R9),R11
+       CMPB    R10,R5,R10
+       CMPB    R11,R5,R11
+       CMPU    R10,$0
+       CMPU    R11,$0,CR1
+       CNTLZD  R10,R10
+       CNTLZD  R11,R11
+       SRD     $3,R10,R3
+       SRD     $3,R11,R11
+       BNE     found
+
+       ADD     R4,R11,R4
+       MOVD    $-1,R3
+       ISEL    CR1EQ,R3,R4,R3
        RET
 
-found_qw_align:
-       // Use the same algorithm as above. Compress the result into
-       // a single doubleword and move it to a GPR for the final
-       // calculation.
-       VBPERMQ   V6,V10,V6
+cmp4:  // Length 4 - 7
+       CMPU    R4,$4
+       BLT     cmp2
+       MOVD    $-4,R11
+       ADD     $-4,R4,R4
+
+       _LWBEX  (R0)(R3),R10
+       _LWBEX  (R11)(R9),R11
+       CMPB    R10,R5,R10
+       CMPB    R11,R5,R11
+       CNTLZW  R10,R10
+       CNTLZW  R11,R11
+       CMPU    R10,$32
+       CMPU    R11,$32,CR1
+       SRD     $3,R10,R3
+       SRD     $3,R11,R11
+       BNE     found
 
-#ifdef GOARCH_ppc64le
-       MFVRD     V6,R3
-       ADD       $-1,R3,R11
-       ANDN      R3,R11,R11
-       POPCNTD   R11,R11
-#else
-       VSLDOI    $6,V6,V6,V6
-       MFVRD     V6,R3
-       CNTLZD    R3,R11
-#endif
-       ADD       R8,R11,R3
-       CMPU      R11,R4
-       BLT       return
-       BR        notfound
-       PCALIGN   $16
-
-done:
-       ADD     $-1,R10,R6
-       // Offset of last index for the final
-       // doubleword comparison
-       RLDICL  $0,R6,$61,R6
-       // At this point, R3 has 0xFF in the same position as the byte we are
-       // looking for in the doubleword. Use that to calculate the exact index
-       // of the byte.
-#ifdef GOARCH_ppc64le
-       ADD     $-1,R3,R11
-       ANDN    R3,R11,R11
-       POPCNTD R11,R11         // Count trailing zeros (Little Endian).
-#else
-       CNTLZD  R3,R11          // Count leading zeros (Big Endian).
-#endif
-       CMPU    R8,R7           // Check if we are at the last doubleword.
-       SRD     $3,R11          // Convert trailing zeros to bytes.
-       ADD     R11,R8,R3
-       CMPU    R11,R6,CR7      // If at the last doubleword, check the byte offset.
-       BNE     return
-       BLE     CR7,return
-       BR      notfound
+       ADD     R4,R11,R4
+       MOVD    $-1,R3
+       ISEL    CR1EQ,R3,R4,R3
+       RET
 
-small_string:
-       // process string of length < 32 bytes
-       // We unroll this loop for better performance.
-       CMPU    R4,$0           // Check for length=0
-       BEQ     notfound
+cmp2:  // Length 2 - 3
+       CMPU    R4,$2
+       BLT     cmp1
 
-       MOVD    0(R8),R12       // Load one doubleword from the aligned address in R8.
-       CMPB    R12,R5,R3       // Check for a match.
-       AND     R9,R3,R3        // Mask bytes below s_base.
-       CMPU    R3,$0,CR7       // If we have a match, jump to the final computation.
-       RLDICR  $0,R7,$60,R7    // Last doubleword in R7.
-       CMPU    R8,R7
-       BNE     CR7,done
-       BEQ     notfound        // Hit length.
-
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       CMPU    R8,R7
-       BNE     CR6,done
-       BEQ     notfound
+       _LHBEX  (R0)(R3),R10
+       CMPB    R10,R5,R10
+       SLDCC   $48,R10,R10
+       CNTLZD  R10,R10
+       SRD     $3,R10,R3
+       BNE     found
 
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       CMPU    R8,R7
-       BNE     CR6,done
-       BEQ     notfound
+cmp1:  // Length 1
+       MOVD    $-1,R3
+       ANDCC   $1,R4,R31
+       BEQ     found
 
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       CMPU    R8,R7
-       BNE     CR6,done
-       BEQ     notfound
+       MOVBZ   -1(R9),R10
+       CMPB    R10,R5,R10
+       ANDCC   $1,R10
+       ADD     $-1,R4
+       ISEL    CR0EQ,R3,R4,R3
 
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       BNE     CR6,done
-       BR      notfound
+found:
+       RET
+#endif
+
+notfound:
+       MOVD $-1,R3
+       RET