internal/bytealg: rewrite PPC64 Compare

author Paul E. Murphy <murp@ibm.com>

Mon, 27 Feb 2023 22:04:50 +0000 (16:04 -0600)

committer Paul Murphy <murp@ibm.com>

Tue, 21 Mar 2023 13:10:36 +0000 (13:10 +0000)
author Paul E. Murphy <murp@ibm.com>
Mon, 27 Feb 2023 22:04:50 +0000 (16:04 -0600)
committer Paul Murphy <murp@ibm.com>
Tue, 21 Mar 2023 13:10:36 +0000 (13:10 +0000)
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s

index cbe0525af55d40e0e22a292d630098e25afa760a..f3f8b4abd167dfb0684480c11725529ffd2b8564 100644 (file)
--- a/src/internal/bytealg/compare_ppc64x.s
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -7,37 +7,62 @@
  #include "go_asm.h"
  #include "textflag.h"
  
+// Helper names for x-form loads in BE ordering.
+#ifdef  GOARCH_ppc64le
+#define _LDBEX MOVDBR
+#define _LWBEX MOVWBR
+#define _LHBEX MOVHBR
+#else
+#define _LDBEX MOVD
+#define _LWBEX MOVW
+#define _LHBEX MOVH
+#endif
+
+#ifdef GOPPC64_power9
+#define SETB_CR0(rout) SETB CR0, rout
+#define SETB_CR1(rout) SETB CR1, rout
+#define SETB_INIT()
+#define SETB_CR0_NE(rout) SETB_CR0(rout)
+#else
+// A helper macro to emulate SETB on P8. This assumes
+// -1 is in R20, and 1 is in R21. crxlt and crxeq must
+// also be the same CR field.
+#define _SETB(crxlt, crxeq, rout) \
+       ISEL    crxeq,R0,R21,rout \
+       ISEL    crxlt,R20,rout,rout
+
+// A special case when it is known the comparison
+// will always be not equal. The result must be -1 or 1.
+#define SETB_CR0_NE(rout) \
+       ISEL    CR0LT,R20,R21,rout
+
+#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
+#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
+#define SETB_INIT() \
+       MOVD    $-1,R20 \
+       MOVD    $1,R21
+#endif
+
  TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
         // incoming:
-       // R3 a addr -> R5
-       // R4 a len  -> R3
-       // R5 a cap unused
-       // R6 b addr -> R6
-       // R7 b len  -> R4
-       // R8 b cap unused
-       MOVD    R3, R5
-       MOVD    R4, R3
-       MOVD    R7, R4
-       CMP     R5,R6,CR7
-       CMP     R3,R4,CR6
-       BEQ     CR7,equal
-       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
-       CMP     R16,$1
-       BNE     power8
-       BR      cmpbodyp9<>(SB)
-power8:
+       // R3 a addr
+       // R4 a len
+       // R6 b addr
+       // R7 b len
+       //
+       // on entry to cmpbody:
+       // R3 result of comparing len(a) with len(b) (-1, 0, 1); returned if the compared bytes match
+       // R5 a addr
+       // R6 b addr
+       // R9 min(len(a),len(b))
+       SETB_INIT()
+       MOVD    R3,R5
+       CMP     R4,R7,CR0
+       CMP     R3,R6,CR7
+       ISEL    CR0LT,R4,R7,R9
+       SETB_CR0(R3)
+       BC      $12,30,LR       // beqlr cr7
         BR      cmpbody<>(SB)
-equal:
-       BEQ     CR6,done
-       MOVD    $1, R8
-       BGT     CR6,greater
-       NEG     R8
-greater:
-       MOVD    R8, R3
-       RET
-done:
-       MOVD    $0, R3
-       RET
  
  TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
         // incoming:
@@ -45,32 +70,21 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
         // R4 a len
         // R5 b addr -> R6
         // R6 b len
-       MOVD    R6, R7
-       MOVD    R5, R6
-       MOVD    R3, R5
-       MOVD    R4, R3
-       MOVD    R7, R4
-       CMP     R5,R6,CR7
-       CMP     R3,R4,CR6
-       BEQ     CR7,equal
-       MOVBZ   internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
-       CMP     R16,$1
-       BNE     power8
-       BR      cmpbodyp9<>(SB)
-power8:
+       //
+       // on entry to cmpbody:
+       // R3 result of comparing len(a) with len(b) (-1, 0, 1); returned if the compared bytes match
+       // R5 a addr
+       // R6 b addr
+       // R9 min(len(a),len(b))
+       SETB_INIT()
+       CMP     R4,R6,CR0
+       CMP     R3,R5,CR7
+       ISEL    CR0LT,R4,R6,R9
+       MOVD    R5,R6
+       MOVD    R3,R5
+       SETB_CR0(R3)
+       BC      $12,30,LR       // beqlr cr7
         BR      cmpbody<>(SB)
-equal:
-       BEQ     CR6,done
-       MOVD    $1, R8
-       BGT     CR6,greater
-       NEG     R8
-greater:
-       MOVD    R8, R3
-       RET
-
-done:
-       MOVD    $0, R3
-       RET
  
  #ifdef GOARCH_ppc64le
  DATA byteswap<>+0(SB)/8, $0x0706050403020100
@@ -79,32 +93,33 @@ GLOBL byteswap<>+0(SB), RODATA, $16
  #define SWAP V21
  #endif
  
-// Do an efficient memcmp for ppc64le/ppc64/POWER8
-// R3 = a len
-// R4 = b len
-// R5 = a addr
-// R6 = b addr
-// On exit:
-// R3 = return value
  TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
-       MOVD    R3,R8           // set up length
-       CMP     R3,R4,CR2       // unequal?
-       BLT     CR2,setuplen    // BLT CR2
-       MOVD    R4,R8           // use R4 for comparison len
-setuplen:
-       CMP     R8,$32          // optimize >= 32
-       MOVD    R8,R9
-       BLT     setup8a         // optimize < 32
-       MOVD    $16,R10         // set offsets to load into vectors
-       CMP     R8,$64
-       BLT     cmp32           // process size 32-63
-
-       DCBT    (R5)            // optimize >= 64
+start:
+       CMP     R9,$16,CR0
+       CMP     R9,$32,CR1
+       CMP     R9,$64,CR2
+       MOVD    $16,R10
+       BLT     cmp8
+       BLT     CR1,cmp16
+       BLT     CR2,cmp32
+
+cmp64: // >= 64B
+       DCBT    (R5)            // optimize for size>=64
         DCBT    (R6)            // cache hint
+
+       SRD     $6,R9,R14       // There is at least one iteration.
+       MOVD    R14,CTR
+       ANDCC   $63,R9,R9
+       CMP     R9,$16,CR1      // Do setup for tail check early on.
+       CMP     R9,$32,CR2
+       CMP     R9,$48,CR3
+       ADD     $-16,R9,R9
+
         MOVD    $32,R11         // set offsets to load into vector
         MOVD    $48,R12         // set offsets to load into vector
  
-loop64a:// process size 64 and greater
+       PCALIGN $32
+cmp64_loop:
         LXVD2X  (R5)(R0),V3     // load bytes of A at offset 0 into vector
         LXVD2X  (R6)(R0),V4     // load bytes of B at offset 0 into vector
         VCMPEQUDCC      V3,V4,V1
@@ -112,391 +127,206 @@ loop64a:// process size 64 and greater
  
         LXVD2X  (R5)(R10),V3    // load bytes of A at offset 16 into vector
         LXVD2X  (R6)(R10),V4    // load bytes of B at offset 16 into vector
-
         VCMPEQUDCC      V3,V4,V1
         BGE     CR6,different
  
         LXVD2X  (R5)(R11),V3    // load bytes of A at offset 32 into vector
         LXVD2X  (R6)(R11),V4    // load bytes of B at offset 32 into vector
-
         VCMPEQUDCC      V3,V4,V1
         BGE     CR6,different
  
         LXVD2X  (R5)(R12),V3    // load bytes of A at offset 48 into vector
         LXVD2X  (R6)(R12),V4    // load bytes of B at offset 48 into vector
-
         VCMPEQUDCC      V3,V4,V1
         BGE     CR6,different
  
-       ADD     $-64,R9,R9      // reduce remaining size by 64
         ADD     $64,R5,R5       // increment to next 64 bytes of A
         ADD     $64,R6,R6       // increment to next 64 bytes of B
-       CMPU    R9,$64
-       BGE     loop64a         // loop back to loop64a only if there are >= 64 bytes remaining
-       
-       CMPU    R9,$32
-       BGE     cmp32           // loop to cmp32 if there are 32-64 bytes remaining
-       CMPU    R9,$0
-       BNE     rem             // loop to rem if the remainder is not 0
-
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-cmp32:
-       LXVD2X  (R5)(R0),V3     // load bytes of A at offset 0 into vector
-       LXVD2X  (R6)(R0),V4     // load bytes of B at offset 0 into vector
+       BDNZ    cmp64_loop
+       BC      $12,2,LR        // beqlr
+
+       // Finish out tail with minimal overlapped checking.
+       // Note, 0 tail is handled by beqlr above.
+       BLE     CR1,cmp64_tail_gt0
+       BLE     CR2,cmp64_tail_gt16
+       BLE     CR3,cmp64_tail_gt32
+
+cmp64_tail_gt48: // 49 - 63 B
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
  
+       LXVD2X  (R5)(R10),V3
+       LXVD2X  (R6)(R10),V4
         VCMPEQUDCC      V3,V4,V1
         BGE     CR6,different
  
-       LXVD2X  (R5)(R10),V3    // load bytes of A at offset 16 into vector
-       LXVD2X  (R6)(R10),V4    // load bytes of B at offset 16 into vector
+       LXVD2X  (R5)(R11),V3
+       LXVD2X  (R6)(R11),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
  
+       BR cmp64_tail_gt0
+
+       PCALIGN $16
+cmp64_tail_gt32: // 33 - 48B
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
         VCMPEQUDCC      V3,V4,V1
         BGE     CR6,different
  
-       ADD     $-32,R9,R9      // reduce remaining size by 32
-       ADD     $32,R5,R5       // increment to next 32 bytes of A
-       ADD     $32,R6,R6       // increment to next 32 bytes of B
-       CMPU    R9,$0
-       BNE     rem             // loop to rem if the remainder is not 0
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-rem:
-       MOVD    R9,R8
-       ANDCC   $24,R8,R9       // Any 8 byte chunks?
-       BEQ     leftover        // and result is 0
-       BR      setup8a
+       LXVD2X  (R5)(R10),V3
+       LXVD2X  (R6)(R10),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
  
-different:
-#ifdef GOARCH_ppc64le
-       MOVD    $byteswap<>+00(SB), R16
-       LXVD2X  (R16)(R0),SWAP  // Set up swap string
+       BR cmp64_tail_gt0
  
-       VPERM   V3,V3,SWAP,V3
-       VPERM   V4,V4,SWAP,V4
-#endif
-       MFVSRD  VS35,R16        // move upper doublwords of A and B into GPR for comparison
-       MFVSRD  VS36,R10
+       PCALIGN $16
+cmp64_tail_gt16: // 17 - 32B
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
  
-       CMPU    R16,R10
-       BEQ     lower
-       BGT     greater
-       MOVD    $-1,R3          // return value if A < B
-       RET
-lower:
-       VSLDOI  $8,V3,V3,V3     // move lower doublwords of A and B into GPR for comparison
-       MFVSRD  VS35,R16
-       VSLDOI  $8,V4,V4,V4
-       MFVSRD  VS36,R10
+       BR cmp64_tail_gt0
+
+       PCALIGN $16
+cmp64_tail_gt0: // 1 - 16B
+       LXVD2X  (R5)(R9),V3
+       LXVD2X  (R6)(R9),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
  
-       CMPU    R16,R10
-       BGT     greater
-       MOVD    $-1,R3          // return value if A < B
         RET
-setup8a:
-       SRADCC  $3,R8,R9        // get the 8 byte count
-       BEQ     leftover        // shifted value is 0
-       CMPU    R8,$8           // optimize 8byte move
-       BEQ     size8
-       CMPU    R8,$16
-       BEQ     size16
-       MOVD    R9,CTR          // loop count for doublewords
-loop8:
-#ifdef  GOARCH_ppc64le
-       MOVDBR  (R5+R0),R16     // doublewords to compare
-       MOVDBR  (R6+R0),R10     // LE compare order
-#else
-       MOVD    (R5+R0),R16     // doublewords to compare
-       MOVD    (R6+R0),R10     // BE compare order
-#endif
-       ADD     $8,R5
-       ADD     $8,R6
-       CMPU    R16,R10         // match?
-       BC      8,2,loop8       // bt ctr <> 0 && cr
-       BGT     greater
-       BLT     less
-leftover:
-       ANDCC   $7,R8,R9        // check for leftover bytes
-       BEQ     zeroremainder
-simplecheck:
-       MOVD    R0,R14
-       CMP     R9,$4           // process 4 bytes
-       BLT     halfword
-#ifdef  GOARCH_ppc64le
-       MOVWBR  (R5)(R14),R10
-       MOVWBR  (R6)(R14),R11
-#else
-       MOVWZ   (R5)(R14),R10
-       MOVWZ   (R6)(R14),R11
-#endif
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $-4,R9
-       ADD     $4,R14
-       PCALIGN $16
  
-halfword:
-       CMP     R9,$2           // process 2 bytes
-       BLT     byte
-#ifdef  GOARCH_ppc64le
-       MOVHBR  (R5)(R14),R10
-       MOVHBR  (R6)(R14),R11
-#else
-       MOVHZ   (R5)(R14),R10
-       MOVHZ   (R6)(R14),R11
-#endif
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $-2,R9
-       ADD     $2,R14
-       PCALIGN $16
-byte:
-       CMP     R9,$0           // process 1 byte
-       BEQ     skip
-       MOVBZ   (R5)(R14),R10
-       MOVBZ   (R6)(R14),R11
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       PCALIGN $16
-skip:
-       BEQ     CR2,equal
-       BGT     CR2,greater
+       PCALIGN $16
+cmp32: // 32 - 63B
+       ANDCC   $31,R9,R9
  
-less:  MOVD    $-1,R3          // return value if A < B
-       RET
-size16:
-       LXVD2X  (R5)(R0),V3     // load bytes of A at offset 0 into vector
-       LXVD2X  (R6)(R0),V4     // load bytes of B at offset 0 into vector
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
         VCMPEQUDCC      V3,V4,V1
         BGE     CR6,different
-zeroremainder:
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-size8:
-#ifdef  GOARCH_ppc64le
-       MOVDBR  (R5+R0),R16     // doublewords to compare
-       MOVDBR  (R6+R0),R10     // LE compare order
-#else
-       MOVD    (R5+R0),R16     // doublewords to compare
-       MOVD    (R6+R0),R10     // BE compare order
-#endif
-       CMPU    R16,R10         // match?
-       BGT     greater
-       BLT     less
-       BGT     CR2,greater     // 2nd len > 1st len
-       BLT     CR2,less        // 2nd len < 1st len
-equal:
-       MOVD    $0, R3          // return value if A == B
-       RET
-greater:
-       MOVD    $1,R3           // return value if A > B
-       RET
  
-// Do an efficient memcmp for ppc64le/ppc64/POWER9
-// R3 = a len
-// R4 = b len
-// R5 = a addr
-// R6 = b addr
-// On exit:
-// R3 = return value
-TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
-       MOVD    R3,R8           // set up length
-       CMP     R3,R4,CR2       // unequal?
-       BLT     CR2,setuplen    // BLT CR2
-       MOVD    R4,R8           // use R4 for comparison len
-setuplen:
-       CMP     R8,$16          // optimize for size<16
-       MOVD    R8,R9
-       BLT     simplecheck
-       MOVD    $16,R10         // set offsets to load into vectors
-       CMP     R8,$32          // optimize for size 16-31
-       BLT     cmp16
-       CMP     R8,$64
-       BLT     cmp32           // optimize for size 32-63
-       DCBT    (R5)            // optimize for size>=64
-       DCBT    (R6)            // cache hint
+       LXVD2X  (R10)(R5),V3
+       LXVD2X  (R10)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
  
-       MOVD    $32,R11         // set offsets to load into vector
-       MOVD    $48,R12         // set offsets to load into vector
+       BC      $12,2,LR        // beqlr
+       ADD     R9,R10,R10
  
-loop64a:// process size 64 and greater
-       LXVB16X (R0)(R5),V3     // load bytes of A at offset 0 into vector
-       LXVB16X (R0)(R6),V4     // load bytes of B at offset 0 into vector
-       VCMPNEBCC       V3,V4,V1        // record comparison into V1
-       BNE     CR6,different   // jump out if its different
+       LXVD2X  (R9)(R5),V3
+       LXVD2X  (R9)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
  
-       LXVB16X (R10)(R5),V3    // load bytes of A at offset 16 into vector
-       LXVB16X (R10)(R6),V4    // load bytes of B at offset 16 into vector
-       VCMPNEBCC       V3,V4,V1
-       BNE     CR6,different
+       LXVD2X  (R10)(R5),V3
+       LXVD2X  (R10)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
+       RET
  
-       LXVB16X (R11)(R5),V3    // load bytes of A at offset 32 into vector
-       LXVB16X (R11)(R6),V4    // load bytes of B at offset 32 into vector
-       VCMPNEBCC       V3,V4,V1
-       BNE     CR6,different
+       PCALIGN $16
+cmp16: // 16 - 31B
+       ANDCC   $15,R9,R9
+       LXVD2X  (R0)(R5),V3
+       LXVD2X  (R0)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
+       BC      $12,2,LR        // beqlr
  
-       LXVB16X (R12)(R5),V3    // load bytes of A at offset 48 into vector
-       LXVB16X (R12)(R6),V4    // load bytes of B at offset 48 into vector
-       VCMPNEBCC       V3,V4,V1
-       BNE     CR6,different
+       LXVD2X  (R9)(R5),V3
+       LXVD2X  (R9)(R6),V4
+       VCMPEQUDCC      V3,V4,V1
+       BGE     CR6,different
+       RET
  
-       ADD     $-64,R9,R9      // reduce remaining size by 64
-       ADD     $64,R5,R5       // increment to next 64 bytes of A
-       ADD     $64,R6,R6       // increment to next 64 bytes of B
-       CMPU    R9,$64
-       BGE     loop64a         // loop back to loop64a only if there are >= 64 bytes remaining
-
-       CMPU    R9,$32
-       BGE     cmp32           // loop to cmp32 if there are 32-64 bytes remaining
-       CMPU    R9,$16
-       BGE     cmp16           // loop to cmp16 if there are 16-31 bytes left
-       CMPU    R9,$0
-       BNE     simplecheck     // loop to simplecheck for remaining bytes
-
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-cmp32:
-       LXVB16X (R0)(R5),V3     // load bytes of A at offset 0 into vector
-       LXVB16X (R0)(R6),V4     // load bytes of B at offset 0 into vector
-
-       VCMPNEBCC       V3,V4,V1        // record comparison into V1
-       BNE     CR6,different   // jump out if its different
-
-       LXVB16X (R10)(R5),V3    // load bytes of A at offset 16 into vector
-       LXVB16X (R10)(R6),V4    // load bytes of B at offset 16 into vector
-       VCMPNEBCC       V3,V4,V1
-       BNE     CR6,different
-
-       ADD     $-32,R9,R9      // reduce remaining size by 32
-       ADD     $32,R5,R5       // increment to next 32 bytes of A
-       ADD     $32,R6,R6       // increment to next 32 bytes of B
-       CMPU    R9,$16          // loop to cmp16 if there are 16-31 bytes left
-       BGE     cmp16
-       CMPU    R9,$0
-       BNE     simplecheck     // loop to simplecheck for remainder bytes
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if len(A)<len(B)
-       BR      greater         // jump to greater otherwise
+       PCALIGN $16
  different:
+#ifdef GOARCH_ppc64le
+       MOVD    $byteswap<>+00(SB),R16
+       LXVD2X  (R16)(R0),SWAP  // Set up swap string
  
-       MFVSRD  VS35,R16        // move upper doublwords of A and B into GPR for comparison
+       VPERM   V3,V3,SWAP,V3
+       VPERM   V4,V4,SWAP,V4
+#endif
+
+       MFVSRD  VS35,R16        // move upper doublewords of A and B into GPR for comparison
         MFVSRD  VS36,R10
  
         CMPU    R16,R10
         BEQ     lower
-       BGT     greater
-       MOVD    $-1,R3          // return value if A < B
+       SETB_CR0_NE(R3)
         RET
+
+       PCALIGN $16
  lower:
-       MFVSRLD VS35,R16        // next move lower doublewords of A and B into GPR for comparison
-       MFVSRLD VS36,R10
+       VSLDOI  $8,V3,V3,V3     // move lower doublewords of A and B into GPR for comparison
+       MFVSRD  VS35,R16
+       VSLDOI  $8,V4,V4,V4
+       MFVSRD  VS36,R10
  
         CMPU    R16,R10
-       BGT     greater
-       MOVD    $-1,R3          // return value if A < B
+       SETB_CR0_NE(R3)
         RET
  
-greater:
-       MOVD    $1,R3           // return value if A > B
+       PCALIGN $16
+cmp8:  // 8 - 15B
+       CMP     R9,$8
+       BLT     cmp4
+       ANDCC   $7,R9,R9
+       _LDBEX  (R0)(R5),R10
+       _LDBEX  (R0)(R6),R11
+       _LDBEX  (R9)(R5),R12
+       _LDBEX  (R9)(R6),R14
+       CMPU    R10,R11,CR0
+       SETB_CR0(R5)
+       CMPU    R12,R14,CR1
+       SETB_CR1(R6)
+       CRAND   CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
+       ISEL    CR0EQ,R6,R5,R4
+       ISEL    CR1EQ,R3,R4,R3
         RET
-cmp16:
-       ANDCC   $16,R9,R31
-       BEQ     tail
-
-       LXVB16X (R0)(R5),V3     // load bytes of A at offset 16 into vector
-       LXVB16X (R0)(R6),V4     // load bytes of B at offset 16 into vector
-       VCMPEQUDCC      V3,V4,V1
-       BGE     CR6,different
-
-       ADD     $16,R5
-       ADD     $16,R6
-tail:
-       ANDCC   $15,R9          // Load the last 16 bytes (we know there are at least 32b)
-       BEQ     end
-
-       ADD     R9,R5
-       ADD     R9,R6
-       MOVD    $-16,R10
  
-       LXVB16X (R10)(R5),V3    // load bytes of A at offset 16 into vector
-       LXVB16X (R10)(R6),V4    // load bytes of B at offset 16 into vector
-       VCMPEQUDCC      V3,V4,V1
-       BGE     CR6,different
-end:
-       BEQ     CR2,equal       // remainder is zero, jump to equal if len(A)==len(B)
-       BLT     CR2,less        // jump to less if BLT CR2 that is, len(A)<len(B)
-       BR      greater         // jump to greater otherwise
-simplecheck:
-       MOVD    $0,R14          // process 8 bytes
-       CMP     R9,$8
-       BLT     word
-#ifdef  GOARCH_ppc64le
-       MOVDBR  (R5+R14),R10
-       MOVDBR  (R6+R14),R11
-#else
-       MOVD    (R5+R14),R10
-       MOVD    (R6+R14),R11
-#endif
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $8,R14
-       ADD     $-8,R9
         PCALIGN $16
-word:
-       CMP     R9,$4           // process 4 bytes
-       BLT     halfword
-#ifdef  GOARCH_ppc64le
-       MOVWBR  (R5+R14),R10
-       MOVWBR  (R6+R14),R11
-#else
-       MOVWZ   (R5+R14),R10
-       MOVWZ   (R6+R14),R11
-#endif
+cmp4:  // 4 - 7B
+       CMP     R9,$4
+       BLT     cmp2
+       ANDCC   $3,R9,R9
+       _LWBEX  (R0)(R5),R10
+       _LWBEX  (R0)(R6),R11
+       _LWBEX  (R9)(R5),R12
+       _LWBEX  (R9)(R6),R14
+       RLDIMI  $32,R10,$0,R12
+       RLDIMI  $32,R11,$0,R14
+       CMPU    R12,R14
+       BR      cmp0
+
+       PCALIGN $16
+cmp2:  // 2 - 3B
+       CMP     R9,$2
+       BLT     cmp1
+       ANDCC   $1,R9,R9
+       _LHBEX  (R0)(R5),R10
+       _LHBEX  (R0)(R6),R11
+       _LHBEX  (R9)(R5),R12
+       _LHBEX  (R9)(R6),R14
+       RLDIMI  $32,R10,$0,R12
+       RLDIMI  $32,R11,$0,R14
+       CMPU    R12,R14
+       BR      cmp0
+
+       PCALIGN $16
+cmp1:
+       CMP     R9,$0
+       BEQ     cmp0
+       MOVBZ   (R5),R10
+       MOVBZ   (R6),R11
         CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $4,R14
-       ADD     $-4,R9
-       PCALIGN $16
-halfword:
-       CMP     R9,$2           // process 2 bytes
-       BLT     byte
-#ifdef  GOARCH_ppc64le
-       MOVHBR  (R5+R14),R10
-       MOVHBR  (R6+R14),R11
-#else
-       MOVHZ   (R5+R14),R10
-       MOVHZ   (R6+R14),R11
-#endif
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       ADD     $2,R14
-       ADD     $-2,R9
-       PCALIGN $16
-byte:
-       CMP     R9,$0           // process 1 byte
-       BEQ     skip
-       MOVBZ   (R5+R14),R10
-       MOVBZ   (R6+R14),R11
-       CMPU    R10,R11
-       BGT     greater
-       BLT     less
-       PCALIGN $16
-skip:
-       BEQ     CR2,equal
-       BGT     CR2,greater
-less:
-       MOVD    $-1,R3          // return value if A < B
-       RET
-equal:
-       MOVD    $0, R3          // return value if A == B
+cmp0:
+       SETB_CR0(R6)
+       ISEL    CR0EQ,R3,R6,R3
         RET
author	Paul E. Murphy <murp@ibm.com>
	Mon, 27 Feb 2023 22:04:50 +0000 (16:04 -0600)
committer	Paul Murphy <murp@ibm.com>
	Tue, 21 Mar 2023 13:10:36 +0000 (13:10 +0000)