internal/bytealg: rewrite indexbytebody on PPC64

author Paul E. Murphy <murp@ibm.com>

Mon, 6 Mar 2023 22:51:31 +0000 (16:51 -0600)

committer Paul Murphy <murp@ibm.com>

Fri, 21 Apr 2023 16:10:29 +0000 (16:10 +0000)
author Paul E. Murphy <murp@ibm.com>
Mon, 6 Mar 2023 22:51:31 +0000 (16:51 -0600)
committer Paul Murphy <murp@ibm.com>
Fri, 21 Apr 2023 16:10:29 +0000 (16:10 +0000)
diff --git a/src/internal/bytealg/indexbyte_ppc64x.s b/src/internal/bytealg/indexbyte_ppc64x.s

index 1a6e852d67258f107270cbce9bbb0c7bff4e841f..b6714f45aae3cab49b0caecb1723a0d72377ce6e 100644 (file)
--- a/src/internal/bytealg/indexbyte_ppc64x.s
+++ b/src/internal/bytealg/indexbyte_ppc64x.s
@@ -11,381 +11,304 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
         // R3 = byte array pointer
         // R4 = length
         MOVD    R6, R5          // R5 = byte
-       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
         BR      indexbytebody<>(SB)
  
  TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
         // R3 = string
         // R4 = length
         // R5 = byte
-       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
         BR      indexbytebody<>(SB)
  
+#ifndef GOPPC64_power9
+#ifdef GOARCH_ppc64le
+DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
+DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
+#else
+DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
+DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
+#endif
+GLOBL indexbytevbperm<>+0(SB), RODATA, $16
+#endif
+
+// Some operations are endian specific; choose the correct opcode based on GOARCH.
+// Note, _VCZBEBB is only available on power9 and newer.
+#ifdef GOARCH_ppc64le
+#define _LDBEX MOVDBR
+#define _LWBEX MOVWBR
+#define _LHBEX MOVHBR
+#define _VCZBEBB VCTZLSBB
+#else
+#define _LDBEX MOVD
+#define _LWBEX MOVW
+#define _LHBEX MOVH
+#define _VCZBEBB VCLZLSBB
+#endif
+
  // R3 = addr of string
  // R4 = len of string
  // R5 = byte to find
-// R16 = 1 if running on a POWER9 system, 0 otherwise
  // On exit:
  // R3 = return value
  TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
-       MOVD    R3,R17          // Save base address for calculating the index later.
-       RLDICR  $0,R3,$60,R8    // Align address to doubleword boundary in R8.
-       RLDIMI  $8,R5,$48,R5    // Replicating the byte across the register.
-       ADD     R4,R3,R7        // Last acceptable address in R7.
+       CMPU    R4,$32
  
-       RLDIMI  $16,R5,$32,R5
-       CMPU    R4,$32          // Check if it's a small string (≤32 bytes). Those will be processed differently.
-       MOVD    $-1,R9
-       RLWNM   $3,R3,$26,$28,R6        // shift amount for mask (r3&0x7)*8
-       RLDIMI  $32,R5,$0,R5
-       MOVD    R7,R10          // Save last acceptable address in R10 for later.
-       ADD     $-1,R7,R7
-#ifdef GOARCH_ppc64le
-       SLD     R6,R9,R9        // Prepare mask for Little Endian
-#else
-       SRD     R6,R9,R9        // Same for Big Endian
+#ifndef GOPPC64_power9
+       // Load VBPERMQ constant to reduce compare into an ordered bit mask.
+       MOVD    $indexbytevbperm<>+00(SB),R16
+       LXVD2X  (R16),V0        // Set up swap string
  #endif
-       BLT     small_string    // Jump to the small string case if it's <32 bytes.
-       CMP     R16,$1          // optimize for power8 v power9
-       BNE     power8
-       VSPLTISB        $3,V10  // Use V10 as control for VBPERMQ
+
         MTVRD   R5,V1
-       LVSL    (R0+R0),V11     // set up the permute vector such that V10 has {0x78, .., 0x8, 0x0}
-       VSLB    V11,V10,V10     // to extract the first bit of match result into GPR
         VSPLTB  $7,V1,V1        // Replicate byte across V1
-       CMP     R4,$64
+
+       BLT     cmp16           // Jump to the small string case if it's <32 bytes.
+
+       CMP     R4,$64,CR1
         MOVD    $16,R11
         MOVD    R3,R8
-       BLT     cmp32
+       BLT     CR1,cmp32       // Special case for length 32 - 63
         MOVD    $32,R12
         MOVD    $48,R6
  
+       RLDICR  $0,R4,$63-6,R9  // R9 = len &^ 63
+       ADD     R3,R9,R9        // R9 = &s[len &^ 63]
+       ANDCC   $63,R4          // (len &= 63) cmp 0.
+
+       PCALIGN $16
  loop64:
-       LXVB16X (R0)(R8),V2     // scan 64 bytes at a time
+       LXVD2X  (R0)(R8),V2     // Scan 64 bytes at a time, starting at &s[0]
         VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat0    // match found at R8, jump out
+       BNE     CR6,foundat0    // Match found at R8, jump out
  
-       LXVB16X (R8)(R11),V2
+       LXVD2X  (R11)(R8),V2
         VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat1    // match found at R8+16 bytes, jump out
+       BNE     CR6,foundat1    // Match found at R8+16 bytes, jump out
  
-       LXVB16X (R8)(R12),V2
+       LXVD2X  (R12)(R8),V2
         VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat2    // match found at R8+32 bytes, jump out
+       BNE     CR6,foundat2    // Match found at R8+32 bytes, jump out
  
-       LXVB16X (R8)(R6),V2
+       LXVD2X  (R6)(R8),V2
         VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat3    // match found at R8+48 bytes, jump out
+       BNE     CR6,foundat3    // Match found at R8+48 bytes, jump out
+
         ADD     $64,R8
-       ADD     $-64,R4
-       CMP     R4,$64          // >=64 bytes left to scan?
-       BGE     loop64
-       CMP     R4,$32
-       BLT     rem             // jump to rem if there are < 32 bytes left
-cmp32:
-       LXVB16X (R0)(R8),V2     // 32-63 bytes left
+       CMPU    R8,R9,CR1
+       BNE     CR1,loop64      // R8 != &s[len &^ 63]?
+
+       PCALIGN $32
+       BEQ     notfound        // Is tail length 0? CR0 is set before entering loop64.
+
+       CMP     R4,$32          // Tail length >= 32, use cmp32 path.
+       CMP     R4,$16,CR1
+       BGE     cmp32
+
+       ADD     R8,R4,R9
+       ADD     $-16,R9
+       BLE     CR1,cmp64_tail_gt0
+
+cmp64_tail_gt16:       // Tail length 17 - 31 (a tail of 32 or more was sent to cmp32)
+       LXVD2X  (R0)(R8),V2     // First 16 bytes of the tail
         VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat0    // match found at R8
+       BNE     CR6,foundat0
  
-       LXVB16X (R11)(R8),V2
+cmp64_tail_gt0:        // Tail length 1 - 16
+       MOVD    R9,R8           // foundat0 computes the index from R8, so point it at the vector checked next
+       LXVD2X  (R0)(R9),V2     // R9 = R8 + tail length - 16: the final 16 bytes, which may overlap bytes already scanned
         VCMPEQUBCC      V2,V1,V6
-       BNE     CR6,foundat1    // match found at R8+16
+       BNE     CR6,foundat0
  
-       ADD     $32,R8
-       ADD     $-32,R4
-rem:
-       RLDICR  $0,R8,$60,R8    // align address to reuse code for tail end processing
-       BR      small_string
+       BR      notfound
+
+cmp32: // Length 32 - 63 (R8 = start of the bytes to scan, R4 = their count)
+
+       // Bytes 0 - 15
+       LXVD2X  (R0)(R8),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0
+
+       // Bytes 16 - 31
+       LXVD2X  (R8)(R11),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat1            // Match found at R8+16 bytes, jump out
+
+       BEQ     notfound                // Is length == 32? Shorter lengths never reach cmp32. (CR0 holds this comparison on entry to cmp32)
+       CMP     R4,$48
+
+       ADD     R4,R8,R9                // Compute &s[len(s)-16]
+       ADD     $32,R8,R8
+       ADD     $-16,R9,R9
+       ISEL    CR0GT,R8,R9,R8          // R8 = len(s) <= 48 ? R9 : R8
+
+       // Bytes 32 - 47, or the final 16 bytes when length <= 48
+       LXVD2X  (R0)(R8),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0            // Match found in the vector at R8, jump out
+
+       BLE     notfound                // Length <= 48 (CR0 from CMP R4,$48): the final 16 bytes were just checked
  
+       // Final 16 bytes (length 49 - 63); may overlap bytes 32 - 47
+       MOVD    R9,R8                   // R9 holds the final check.
+       LXVD2X  (R0)(R9),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0            // Match found in the vector at R8 (== R9), jump out
+
+       BR      notfound
+
+// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
+#ifndef GOPPC64_power9
+#define ADJUST_FOR_CNTLZW -16
+#else
+#define ADJUST_FOR_CNTLZW 0
+#endif
+
+// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
+// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
  foundat3:
-       ADD     $16,R8
+       SUB     R3,R8,R3
+       ADD     $48+ADJUST_FOR_CNTLZW,R3
+       BR      vfound
  foundat2:
-       ADD     $16,R8
+       SUB     R3,R8,R3
+       ADD     $32+ADJUST_FOR_CNTLZW,R3
+       BR      vfound
  foundat1:
-       ADD     $16,R8
+       SUB     R3,R8,R3
+       ADD     $16+ADJUST_FOR_CNTLZW,R3
+       BR      vfound
  foundat0:
-       // Compress the result into a single doubleword and
-       // move it to a GPR for the final calculation.
-       VBPERMQ V6,V10,V6
-       MFVRD   V6,R3
-       // count leading zeroes upto the match that ends up in low 16 bits
-       // in both endian modes, compute index by subtracting the number by 16
-       CNTLZW  R3,R11
-       ADD     $-16,R11
-       ADD     R8,R11,R3       // Calculate byte address
-       SUB     R17,R3
+       SUB     R3,R8,R3
+       ADD     $0+ADJUST_FOR_CNTLZW,R3
+vfound:
+       // Map equal values into a 16 bit value with earlier matches setting higher bits.
+#ifndef GOPPC64_power9
+       VBPERMQ V6,V0,V6
+       MFVRD   V6,R4
+       CNTLZW  R4,R4
+#else
+#ifdef GOARCH_ppc64le
+       // Put the value back into LE ordering by swapping doublewords.
+       XXPERMDI        V6,V6,$2,V6
+#endif
+       _VCZBEBB        V6,R4
+#endif
+       ADD     R3,R4,R3
         RET
-power8:
-       // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
-       // in V0, V1 and V10, then branch to the preloop.
-       ANDCC   $63,R3,R11
-       BEQ     CR0,qw_align
-       RLDICL  $0,R3,$61,R11
-
-       MOVD    0(R8),R12       // Load one doubleword from the aligned address in R8.
-       CMPB    R12,R5,R3       // Check for a match.
-       AND     R9,R3,R3        // Mask bytes below s_base
-       RLDICR  $0,R7,$60,R7    // Last doubleword in R7
-       CMPU    R3,$0,CR7       // If we have a match, jump to the final computation
-       BNE     CR7,done
-       ADD     $8,R8,R8
-       ADD     $-8,R4,R4
-       ADD     R4,R11,R4
  
-       // Check for quadword alignment
-       ANDCC   $15,R8,R11
-       BEQ     CR0,qw_align
+cmp16: // Length 16 - 31 (entered for every length < 32; lengths < 16 continue at cmp8)
+       CMPU    R4,$16
+       ADD     R4,R3,R9                // R9 = &s[len(s)], also used by the shorter-length paths
+       BLT     cmp8
  
-       // Not aligned, so handle the next doubleword
-       MOVD    0(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR7
-       BNE     CR7,done
-       ADD     $8,R8,R8
-       ADD     $-8,R4,R4
+       ADD     $-16,R9,R9              // &s[len(s)-16]
  
-       // Either quadword aligned or 64-byte at this point. We can use LVX.
-qw_align:
-
-       // Set up auxiliary data for the vectorized algorithm.
-       VSPLTISB  $0,V0         // Replicate 0 across V0
-       VSPLTISB  $3,V10        // Use V10 as control for VBPERMQ
-       MTVRD     R5,V1
-       LVSL      (R0+R0),V11
-       VSLB      V11,V10,V10
-       VSPLTB    $7,V1,V1      // Replicate byte across V1
-       CMPU      R4, $64       // If len ≤ 64, don't use the vectorized loop
-       BLE       tail
-
-       // We will load 4 quardwords per iteration in the loop, so check for
-       // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
-       ANDCC     $63,R8,R11
-       BEQ       CR0,preloop
-
-       // Not 64-byte aligned. Load one quadword at a time until aligned.
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       ADD         $-16,R4,R4
-
-       ANDCC       $63,R8,R11
-       BEQ         CR0,preloop
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       ADD         $-16,R4,R4
-
-       ANDCC       $63,R8,R11
-       BEQ         CR0,preloop
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
-       BNE         CR6,found_qw_align
-       ADD         $-16,R4,R4
-       ADD         $16,R8,R8
-
-       // 64-byte aligned. Prepare for the main loop.
-preloop:
-       CMPU    R4,$64
-       BLE     tail          // If len ≤ 64, don't use the vectorized loop
-
-       // We are now aligned to a 64-byte boundary. We will load 4 quadwords
-       // per loop iteration. The last doubleword is in R10, so our loop counter
-       // starts at (R10-R8)/64.
-       SUB     R8,R10,R6
-       SRD     $6,R6,R9      // Loop counter in R9
-       MOVD    R9,CTR
-
-       ADD     $-64,R8,R8   // Adjust index for loop entry
-       MOVD    $16,R11      // Load offsets for the vector loads
-       MOVD    $32,R9
-       MOVD    $48,R7
-
-       // Main loop we will load 64 bytes per iteration
-loop:
-       ADD         $64,R8,R8         // Fuse addi+lvx for performance
-       LVX         (R8+R0),V2        // Load 4 16-byte vectors
-       LVX         (R8+R11),V3
-       VCMPEQUB    V1,V2,V6          // Look for byte in each vector
-       VCMPEQUB    V1,V3,V7
-
-       LVX         (R8+R9),V4
-       LVX         (R8+R7),V5
-       VCMPEQUB    V1,V4,V8
-       VCMPEQUB    V1,V5,V9
-
-       VOR         V6,V7,V11         // Compress the result in a single vector
-       VOR         V8,V9,V12
-       VOR         V11,V12,V13
-       VCMPEQUBCC  V0,V13,V14        // Check for byte
-       BGE         CR6,found
-       BC          16,0,loop         // bdnz loop
-
-       // Handle the tailing bytes or R4 ≤ 64
-       RLDICL  $0,R6,$58,R4
-       ADD     $64,R8,R8
-tail:
-       CMPU        R4,$0
-       BEQ         notfound
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       CMPU        R4,$16,CR6
-       BLE         CR6,notfound
-       ADD         $-16,R4,R4
-
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       CMPU        R4,$16,CR6
-       BLE         CR6,notfound
-       ADD         $-16,R4,R4
-
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6
-       BNE         CR6,found_qw_align
-       ADD         $16,R8,R8
-       CMPU        R4,$16,CR6
-       BLE         CR6,notfound
-       ADD         $-16,R4,R4
-
-       LVX         (R8+R0),V4
-       VCMPEQUBCC  V1,V4,V6
-       BNE         CR6,found_qw_align
+       // Bytes 0 - 15
+       LXVD2X  (R0)(R3),V2
+       VCMPEQUBCC      V2,V1,V6
+       MOVD    R3,R8
+       BNE     CR6,foundat0            // Match found in bytes 0 - 15 (R8 = &s[0]), jump out
  
-notfound:
-       MOVD    $-1, R3
-       RET
+       BEQ     notfound                // Length is exactly 16 (CR0 from CMPU R4,$16 above): nothing left to check
  
-found:
-       // We will now compress the results into a single doubleword,
-       // so it can be moved to a GPR for the final index calculation.
-
-       // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
-       // first bit of each byte into bits 48-63.
-       VBPERMQ   V6,V10,V6
-       VBPERMQ   V7,V10,V7
-       VBPERMQ   V8,V10,V8
-       VBPERMQ   V9,V10,V9
-
-       // Shift each 16-bit component into its correct position for
-       // merging into a single doubleword.
-#ifdef GOARCH_ppc64le
-       VSLDOI    $2,V7,V7,V7
-       VSLDOI    $4,V8,V8,V8
-       VSLDOI    $6,V9,V9,V9
-#else
-       VSLDOI    $6,V6,V6,V6
-       VSLDOI    $4,V7,V7,V7
-       VSLDOI    $2,V8,V8,V8
-#endif
+       // Final 16 bytes, starting at &s[len(s)-16]; overlaps the bytes checked above
+       MOVD    R9,R8                   // R9 holds the final check.
+       LXVD2X  (R0)(R9),V2
+       VCMPEQUBCC      V2,V1,V6
+       BNE     CR6,foundat0            // Match found in the vector at R8 (== R9), jump out
+
+       BR      notfound
  
-       // Merge V6-V9 into a single doubleword and move to a GPR.
-       VOR     V6,V7,V11
-       VOR     V8,V9,V4
-       VOR     V4,V11,V4
-       MFVRD   V4,R3
  
-#ifdef GOARCH_ppc64le
-       ADD       $-1,R3,R11
-       ANDN      R3,R11,R11
-       POPCNTD   R11,R11       // Count trailing zeros (Little Endian).
+cmp8:  // Length 8 - 15 (the power10 variant below returns without branching to cmp4)
+#ifdef GOPPC64_power10
+       // Load all the bytes into a single VSR in BE order.
+       SLD     $56,R4,R5
+       LXVLL   R3,R5,V2
+       // Compare, then count the leading bytes which don't match.
+       VCMPEQUB        V2,V1,V6
+       VCLZLSBB        V6,R3
+       // If the count is greater than or equal to the number of bytes, no match was found.
+       CMPU    R3,R4
+       MOVD    $-1,R5
+       // Otherwise, the count is the index of the first match.
+       ISEL    CR0LT,R3,R5,R3
+       RET
  #else
-       CNTLZD  R3,R11          // Count leading zeros (Big Endian).
-#endif
-       ADD     R8,R11,R3       // Calculate byte address
+       RLDIMI  $8,R5,$48,R5    // Replicating the byte across the register.
+       RLDIMI  $16,R5,$32,R5
+       RLDIMI  $32,R5,$0,R5
+       CMPU    R4,$8
+       BLT     cmp4
+       MOVD    $-8,R11
+       ADD     $-8,R4,R4
  
-return:
-       SUB     R17, R3
+       _LDBEX  (R0)(R3),R10    // First 8 bytes
+       _LDBEX  (R11)(R9),R11   // Last 8 bytes, &s[len(s)-8] (R9 = &s[len(s)], R11 = -8)
+       CMPB    R10,R5,R10
+       CMPB    R11,R5,R11
+       CMPU    R10,$0
+       CMPU    R11,$0,CR1
+       CNTLZD  R10,R10
+       CNTLZD  R11,R11
+       SRD     $3,R10,R3       // Leading zero bits / 8 = byte index within the first 8 bytes
+       SRD     $3,R11,R11
+       BNE     found           // Match in the first 8 bytes (CR0 from CMPU R10,$0); R3 holds its index
+
+       ADD     R4,R11,R4       // R4 = (len(s)-8) + byte index within the last 8 bytes
+       MOVD    $-1,R3
+       ISEL    CR1EQ,R3,R4,R3  // R3 = no match in the last 8 bytes (CR1 EQ) ? -1 : R4
         RET
  
-found_qw_align:
-       // Use the same algorithm as above. Compress the result into
-       // a single doubleword and move it to a GPR for the final
-       // calculation.
-       VBPERMQ   V6,V10,V6
+cmp4:  // Length 4 - 7
+       CMPU    R4,$4
+       BLT     cmp2
+       MOVD    $-4,R11
+       ADD     $-4,R4,R4
+
+       _LWBEX  (R0)(R3),R10
+       _LWBEX  (R11)(R9),R11
+       CMPB    R10,R5,R10
+       CMPB    R11,R5,R11
+       CNTLZW  R10,R10
+       CNTLZW  R11,R11
+       CMPU    R10,$32
+       CMPU    R11,$32,CR1
+       SRD     $3,R10,R3
+       SRD     $3,R11,R11
+       BNE     found
  
-#ifdef GOARCH_ppc64le
-       MFVRD     V6,R3
-       ADD       $-1,R3,R11
-       ANDN      R3,R11,R11
-       POPCNTD   R11,R11
-#else
-       VSLDOI    $6,V6,V6,V6
-       MFVRD     V6,R3
-       CNTLZD    R3,R11
-#endif
-       ADD       R8,R11,R3
-       CMPU      R11,R4
-       BLT       return
-       BR        notfound
-       PCALIGN   $16
-
-done:
-       ADD     $-1,R10,R6
-       // Offset of last index for the final
-       // doubleword comparison
-       RLDICL  $0,R6,$61,R6
-       // At this point, R3 has 0xFF in the same position as the byte we are
-       // looking for in the doubleword. Use that to calculate the exact index
-       // of the byte.
-#ifdef GOARCH_ppc64le
-       ADD     $-1,R3,R11
-       ANDN    R3,R11,R11
-       POPCNTD R11,R11         // Count trailing zeros (Little Endian).
-#else
-       CNTLZD  R3,R11          // Count leading zeros (Big Endian).
-#endif
-       CMPU    R8,R7           // Check if we are at the last doubleword.
-       SRD     $3,R11          // Convert trailing zeros to bytes.
-       ADD     R11,R8,R3
-       CMPU    R11,R6,CR7      // If at the last doubleword, check the byte offset.
-       BNE     return
-       BLE     CR7,return
-       BR      notfound
+       ADD     R4,R11,R4
+       MOVD    $-1,R3
+       ISEL    CR1EQ,R3,R4,R3
+       RET
  
-small_string:
-       // process string of length < 32 bytes
-       // We unroll this loop for better performance.
-       CMPU    R4,$0           // Check for length=0
-       BEQ     notfound
+cmp2:  // Length 2 - 3
+       CMPU    R4,$2
+       BLT     cmp1
  
-       MOVD    0(R8),R12       // Load one doubleword from the aligned address in R8.
-       CMPB    R12,R5,R3       // Check for a match.
-       AND     R9,R3,R3        // Mask bytes below s_base.
-       CMPU    R3,$0,CR7       // If we have a match, jump to the final computation.
-       RLDICR  $0,R7,$60,R7    // Last doubleword in R7.
-       CMPU    R8,R7
-       BNE     CR7,done
-       BEQ     notfound        // Hit length.
-
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       CMPU    R8,R7
-       BNE     CR6,done
-       BEQ     notfound
+       _LHBEX  (R0)(R3),R10
+       CMPB    R10,R5,R10
+       SLDCC   $48,R10,R10
+       CNTLZD  R10,R10
+       SRD     $3,R10,R3
+       BNE     found
  
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       CMPU    R8,R7
-       BNE     CR6,done
-       BEQ     notfound
+cmp1:  // Length 1
+       MOVD    $-1,R3
+       ANDCC   $1,R4,R31
+       BEQ     found
  
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       CMPU    R8,R7
-       BNE     CR6,done
-       BEQ     notfound
+       MOVBZ   -1(R9),R10
+       CMPB    R10,R5,R10
+       ANDCC   $1,R10
+       ADD     $-1,R4
+       ISEL    CR0EQ,R3,R4,R3
  
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       BNE     CR6,done
-       BR      notfound
+found:
+       RET
+#endif
+
+notfound:
+       MOVD $-1,R3
+       RET
author	Paul E. Murphy <murp@ibm.com>
	Mon, 6 Mar 2023 22:51:31 +0000 (16:51 -0600)
committer	Paul Murphy <murp@ibm.com>
	Fri, 21 Apr 2023 16:10:29 +0000 (16:10 +0000)