#include "go_asm.h"
#include "textflag.h"
+// Helper names for x-form loads in BE ordering.
+#ifdef GOARCH_ppc64le
+#define _LDBEX MOVDBR
+#define _LWBEX MOVWBR
+#define _LHBEX MOVHBR
+#else
+#define _LDBEX MOVD
+#define _LWBEX MOVW
+#define _LHBEX MOVH
+#endif
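+// For example, _LDBEX (R0)(R5),R10 expands to MOVDBR (a byte-reversed load)
+// on ppc64le and to a plain MOVD on ppc64, so R10 always holds the eight
+// bytes at R5 with the lowest-addressed byte most significant; an unsigned
+// compare of two such values then matches lexicographic (memory) order.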
+
+#ifdef GOPPC64_power9
+#define SETB_CR0(rout) SETB CR0, rout
+#define SETB_CR1(rout) SETB CR1, rout
+#define SETB_INIT()
+#define SETB_CR0_NE(rout) SETB_CR0(rout)
+#else
+// A helper macro to emulate SETB on P8. This assumes
+// -1 is in R20, and 1 is in R21. crxlt and crxeq must
+// both refer to the same CR field.
+#define _SETB(crxlt, crxeq, rout) \
+ ISEL crxeq,R0,R21,rout \
+ ISEL crxlt,R20,rout,rout
+
+// A special case when it is known the comparison
+// will always be not equal. The result must be -1 or 1.
+#define SETB_CR0_NE(rout) \
+ ISEL CR0LT,R20,R21,rout
+
+#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
+#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
+#define SETB_INIT() \
+ MOVD $-1,R20 \
+ MOVD $1,R21
+#endif
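+// Either way, SETB_CRx(rout) leaves rout equal to -1, 0, or 1 when the LT,
+// EQ, or GT bit of that CR field is set, respectively. On P8 the ISEL pair
+// first picks 0 or 1 from the EQ bit, then overrides the result with -1
+// when LT is set.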
+
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
// incoming:
- // R3 a addr -> R5
- // R4 a len -> R3
- // R5 a cap unused
- // R6 b addr -> R6
- // R7 b len -> R4
- // R8 b cap unused
- MOVD R3, R5
- MOVD R4, R3
- MOVD R7, R4
- CMP R5,R6,CR7
- CMP R3,R4,CR6
- BEQ CR7,equal
- MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
- CMP R16,$1
- BNE power8
- BR cmpbodyp9<>(SB)
-power8:
+ // R3 a addr
+ // R4 a len
+ // R6 b addr
+ // R7 b len
+ //
+ // on entry to cmpbody:
+ // R3 return value if len(a) == len(b)
+ // R5 a addr
+ // R6 b addr
+ // R9 min(len(a),len(b))
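+	//
+	// R3 is set below to the sign of len(a)-len(b). If a and b share the
+	// same address (CR7 EQ), their first min(len(a),len(b)) bytes match,
+	// so the beqlr on CR7 can return that length result immediately.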
+ SETB_INIT()
+ MOVD R3,R5
+ CMP R4,R7,CR0
+ CMP R3,R6,CR7
+ ISEL CR0LT,R4,R7,R9
+ SETB_CR0(R3)
+ BC $12,30,LR // beqlr cr7
BR cmpbody<>(SB)
-equal:
- BEQ CR6,done
- MOVD $1, R8
- BGT CR6,greater
- NEG R8
-greater:
- MOVD R8, R3
- RET
-done:
- MOVD $0, R3
- RET
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
// incoming:
-	// R4 a len -> R3
-	// R5 b addr -> R6
-	// R6 b len -> R4
+	// R3 a addr
+	// R4 a len
+	// R5 b addr
+	// R6 b len
- MOVD R6, R7
- MOVD R5, R6
- MOVD R3, R5
- MOVD R4, R3
- MOVD R7, R4
- CMP R5,R6,CR7
- CMP R3,R4,CR6
- BEQ CR7,equal
- MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
- CMP R16,$1
- BNE power8
- BR cmpbodyp9<>(SB)
-power8:
+ //
+ // on entry to cmpbody:
+	// R3 return value if len(a) == len(b)
+ // R5 a addr
+ // R6 b addr
+ // R9 min(len(a),len(b))
+ SETB_INIT()
+ CMP R4,R6,CR0
+ CMP R3,R5,CR7
+ ISEL CR0LT,R4,R6,R9
+ MOVD R5,R6
+ MOVD R3,R5
+ SETB_CR0(R3)
+ BC $12,30,LR // beqlr cr7
BR cmpbody<>(SB)
-equal:
- BEQ CR6,done
- MOVD $1, R8
- BGT CR6,greater
- NEG R8
-greater:
- MOVD R8, R3
- RET
-
-done:
- MOVD $0, R3
- RET
#ifdef GOARCH_ppc64le
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif
-// Do an efficient memcmp for ppc64le/ppc64/POWER8
-// R3 = a len
-// R4 = b len
-// R5 = a addr
-// R6 = b addr
-// On exit:
-// R3 = return value
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
- MOVD R3,R8 // set up length
- CMP R3,R4,CR2 // unequal?
- BLT CR2,setuplen // BLT CR2
- MOVD R4,R8 // use R4 for comparison len
-setuplen:
- CMP R8,$32 // optimize >= 32
- MOVD R8,R9
- BLT setup8a // optimize < 32
- MOVD $16,R10 // set offsets to load into vectors
- CMP R8,$64
- BLT cmp32 // process size 32-63
-
- DCBT (R5) // optimize >= 64
+start:
+ CMP R9,$16,CR0
+ CMP R9,$32,CR1
+ CMP R9,$64,CR2
+ MOVD $16,R10
+ BLT cmp8
+ BLT CR1,cmp16
+ BLT CR2,cmp32
+
+cmp64: // >= 64B
+ DCBT (R5) // optimize for size>=64
DCBT (R6) // cache hint
+
+ SRD $6,R9,R14 // There is at least one iteration.
+ MOVD R14,CTR
+ ANDCC $63,R9,R9
+ CMP R9,$16,CR1 // Do setup for tail check early on.
+ CMP R9,$32,CR2
+ CMP R9,$48,CR3
+ ADD $-16,R9,R9
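+	// R9 now holds tail-16, where tail = len mod 64. The tail blocks below
+	// finish by reloading the last 16 bytes at offset R9 (possibly negative),
+	// overlapping bytes that have already been compared.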
+
MOVD $32,R11 // set offsets to load into vector
MOVD $48,R12 // set offsets to load into vector
-loop64a:// process size 64 and greater
+ PCALIGN $32
+cmp64_loop:
LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
	VCMPEQUDCC V3,V4,V1
	BGE CR6,different
LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
-
VCMPEQUDCC V3,V4,V1
BGE CR6,different
LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
-
VCMPEQUDCC V3,V4,V1
BGE CR6,different
	LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector
	LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector
-
VCMPEQUDCC V3,V4,V1
BGE CR6,different
- ADD $-64,R9,R9 // reduce remaining size by 64
ADD $64,R5,R5 // increment to next 64 bytes of A
ADD $64,R6,R6 // increment to next 64 bytes of B
- CMPU R9,$64
- BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
-
- CMPU R9,$32
- BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining
- CMPU R9,$0
- BNE rem // loop to rem if the remainder is not 0
-
- BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
- BLT CR2,less // jump to less if len(A)<len(B)
- BR greater // jump to greater otherwise
-cmp32:
- LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
- LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+ BDNZ cmp64_loop
+ BC $12,2,LR // beqlr
+
+ // Finish out tail with minimal overlapped checking.
+ // Note, 0 tail is handled by beqlr above.
+ BLE CR1,cmp64_tail_gt0
+ BLE CR2,cmp64_tail_gt16
+ BLE CR3,cmp64_tail_gt32
+
+cmp64_tail_gt48: // 49 - 63 B
+ LXVD2X (R0)(R5),V3
+ LXVD2X (R0)(R6),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+ LXVD2X (R5)(R10),V3
+ LXVD2X (R6)(R10),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
- LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
- LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
+ LXVD2X (R5)(R11),V3
+ LXVD2X (R6)(R11),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+ BR cmp64_tail_gt0
+
+ PCALIGN $16
+cmp64_tail_gt32: // 33 - 48B
+ LXVD2X (R0)(R5),V3
+ LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
- ADD $-32,R9,R9 // reduce remaining size by 32
- ADD $32,R5,R5 // increment to next 32 bytes of A
- ADD $32,R6,R6 // increment to next 32 bytes of B
- CMPU R9,$0
- BNE rem // loop to rem if the remainder is not 0
- BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
- BLT CR2,less // jump to less if len(A)<len(B)
- BR greater // jump to greater otherwise
-rem:
- MOVD R9,R8
- ANDCC $24,R8,R9 // Any 8 byte chunks?
- BEQ leftover // and result is 0
- BR setup8a
+ LXVD2X (R5)(R10),V3
+ LXVD2X (R6)(R10),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
-different:
-#ifdef GOARCH_ppc64le
- MOVD $byteswap<>+00(SB), R16
- LXVD2X (R16)(R0),SWAP // Set up swap string
+ BR cmp64_tail_gt0
- VPERM V3,V3,SWAP,V3
- VPERM V4,V4,SWAP,V4
-#endif
- MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison
- MFVSRD VS36,R10
+ PCALIGN $16
+cmp64_tail_gt16: // 17 - 32B
+ LXVD2X (R0)(R5),V3
+ LXVD2X (R0)(R6),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
- CMPU R16,R10
- BEQ lower
- BGT greater
- MOVD $-1,R3 // return value if A < B
- RET
-lower:
- VSLDOI $8,V3,V3,V3 // move lower doublwords of A and B into GPR for comparison
- MFVSRD VS35,R16
- VSLDOI $8,V4,V4,V4
- MFVSRD VS36,R10
+ BR cmp64_tail_gt0
+
+ PCALIGN $16
+cmp64_tail_gt0: // 1 - 16B
+ LXVD2X (R5)(R9),V3
+ LXVD2X (R6)(R9),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
- CMPU R16,R10
- BGT greater
- MOVD $-1,R3 // return value if A < B
RET
-setup8a:
- SRADCC $3,R8,R9 // get the 8 byte count
- BEQ leftover // shifted value is 0
- CMPU R8,$8 // optimize 8byte move
- BEQ size8
- CMPU R8,$16
- BEQ size16
- MOVD R9,CTR // loop count for doublewords
-loop8:
-#ifdef GOARCH_ppc64le
- MOVDBR (R5+R0),R16 // doublewords to compare
- MOVDBR (R6+R0),R10 // LE compare order
-#else
- MOVD (R5+R0),R16 // doublewords to compare
- MOVD (R6+R0),R10 // BE compare order
-#endif
- ADD $8,R5
- ADD $8,R6
- CMPU R16,R10 // match?
- BC 8,2,loop8 // bt ctr <> 0 && cr
- BGT greater
- BLT less
-leftover:
- ANDCC $7,R8,R9 // check for leftover bytes
- BEQ zeroremainder
-simplecheck:
- MOVD R0,R14
- CMP R9,$4 // process 4 bytes
- BLT halfword
-#ifdef GOARCH_ppc64le
- MOVWBR (R5)(R14),R10
- MOVWBR (R6)(R14),R11
-#else
- MOVWZ (R5)(R14),R10
- MOVWZ (R6)(R14),R11
-#endif
- CMPU R10,R11
- BGT greater
- BLT less
- ADD $-4,R9
- ADD $4,R14
- PCALIGN $16
-halfword:
- CMP R9,$2 // process 2 bytes
- BLT byte
-#ifdef GOARCH_ppc64le
- MOVHBR (R5)(R14),R10
- MOVHBR (R6)(R14),R11
-#else
- MOVHZ (R5)(R14),R10
- MOVHZ (R6)(R14),R11
-#endif
- CMPU R10,R11
- BGT greater
- BLT less
- ADD $-2,R9
- ADD $2,R14
- PCALIGN $16
-byte:
- CMP R9,$0 // process 1 byte
- BEQ skip
- MOVBZ (R5)(R14),R10
- MOVBZ (R6)(R14),R11
- CMPU R10,R11
- BGT greater
- BLT less
- PCALIGN $16
-skip:
- BEQ CR2,equal
- BGT CR2,greater
+ PCALIGN $16
+cmp32: // 32 - 63B
+ ANDCC $31,R9,R9
-less: MOVD $-1,R3 // return value if A < B
- RET
-size16:
- LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
- LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+ LXVD2X (R0)(R5),V3
+ LXVD2X (R0)(R6),V4
VCMPEQUDCC V3,V4,V1
BGE CR6,different
-zeroremainder:
- BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
- BLT CR2,less // jump to less if len(A)<len(B)
- BR greater // jump to greater otherwise
-size8:
-#ifdef GOARCH_ppc64le
- MOVDBR (R5+R0),R16 // doublewords to compare
- MOVDBR (R6+R0),R10 // LE compare order
-#else
- MOVD (R5+R0),R16 // doublewords to compare
- MOVD (R6+R0),R10 // BE compare order
-#endif
- CMPU R16,R10 // match?
- BGT greater
- BLT less
- BGT CR2,greater // 2nd len > 1st len
- BLT CR2,less // 2nd len < 1st len
-equal:
- MOVD $0, R3 // return value if A == B
- RET
-greater:
- MOVD $1,R3 // return value if A > B
- RET
-// Do an efficient memcmp for ppc64le/ppc64/POWER9
-// R3 = a len
-// R4 = b len
-// R5 = a addr
-// R6 = b addr
-// On exit:
-// R3 = return value
-TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
- MOVD R3,R8 // set up length
- CMP R3,R4,CR2 // unequal?
- BLT CR2,setuplen // BLT CR2
- MOVD R4,R8 // use R4 for comparison len
-setuplen:
- CMP R8,$16 // optimize for size<16
- MOVD R8,R9
- BLT simplecheck
- MOVD $16,R10 // set offsets to load into vectors
- CMP R8,$32 // optimize for size 16-31
- BLT cmp16
- CMP R8,$64
- BLT cmp32 // optimize for size 32-63
- DCBT (R5) // optimize for size>=64
- DCBT (R6) // cache hint
+ LXVD2X (R10)(R5),V3
+ LXVD2X (R10)(R6),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
- MOVD $32,R11 // set offsets to load into vector
- MOVD $48,R12 // set offsets to load into vector
+ BC $12,2,LR // beqlr
+ ADD R9,R10,R10
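+	// R9 = len-32 and R10 = len-16 now index the last two 16-byte chunks,
+	// which may overlap the 32 bytes compared above.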
-loop64a:// process size 64 and greater
- LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
- LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
- VCMPNEBCC V3,V4,V1 // record comparison into V1
- BNE CR6,different // jump out if its different
+ LXVD2X (R9)(R5),V3
+ LXVD2X (R9)(R6),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
- LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
- LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
- VCMPNEBCC V3,V4,V1
- BNE CR6,different
+ LXVD2X (R10)(R5),V3
+ LXVD2X (R10)(R6),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+ RET
- LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector
- LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector
- VCMPNEBCC V3,V4,V1
- BNE CR6,different
+ PCALIGN $16
+cmp16: // 16 - 31B
+ ANDCC $15,R9,R9
+ LXVD2X (R0)(R5),V3
+ LXVD2X (R0)(R6),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+ BC $12,2,LR // beqlr
- LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector
- LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector
- VCMPNEBCC V3,V4,V1
- BNE CR6,different
+ LXVD2X (R9)(R5),V3
+ LXVD2X (R9)(R6),V4
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+ RET
- ADD $-64,R9,R9 // reduce remaining size by 64
- ADD $64,R5,R5 // increment to next 64 bytes of A
- ADD $64,R6,R6 // increment to next 64 bytes of B
- CMPU R9,$64
- BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
-
- CMPU R9,$32
- BGE cmp32 // loop to cmp32 if there are 32-64 bytes remaining
- CMPU R9,$16
- BGE cmp16 // loop to cmp16 if there are 16-31 bytes left
- CMPU R9,$0
- BNE simplecheck // loop to simplecheck for remaining bytes
-
- BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
- BLT CR2,less // jump to less if len(A)<len(B)
- BR greater // jump to greater otherwise
-cmp32:
- LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
- LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
-
- VCMPNEBCC V3,V4,V1 // record comparison into V1
- BNE CR6,different // jump out if its different
-
- LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
- LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
- VCMPNEBCC V3,V4,V1
- BNE CR6,different
-
- ADD $-32,R9,R9 // reduce remaining size by 32
- ADD $32,R5,R5 // increment to next 32 bytes of A
- ADD $32,R6,R6 // increment to next 32 bytes of B
- CMPU R9,$16 // loop to cmp16 if there are 16-31 bytes left
- BGE cmp16
- CMPU R9,$0
- BNE simplecheck // loop to simplecheck for remainder bytes
- BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
- BLT CR2,less // jump to less if len(A)<len(B)
- BR greater // jump to greater otherwise
+ PCALIGN $16
different:
+#ifdef GOARCH_ppc64le
+ MOVD $byteswap<>+00(SB),R16
+ LXVD2X (R16)(R0),SWAP // Set up swap string
- MFVSRD VS35,R16 // move upper doublwords of A and B into GPR for comparison
+ VPERM V3,V3,SWAP,V3
+ VPERM V4,V4,SWAP,V4
+#endif
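+	// After the byteswap on LE (or directly on BE), the doublewords moved
+	// into GPRs below compare as unsigned values in memory (lexicographic)
+	// order.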
+
+ MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison
MFVSRD VS36,R10
CMPU R16,R10
BEQ lower
- BGT greater
- MOVD $-1,R3 // return value if A < B
+ SETB_CR0_NE(R3)
RET
+
+ PCALIGN $16
lower:
- MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison
- MFVSRLD VS36,R10
+ VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison
+ MFVSRD VS35,R16
+ VSLDOI $8,V4,V4,V4
+ MFVSRD VS36,R10
CMPU R16,R10
- BGT greater
- MOVD $-1,R3 // return value if A < B
+ SETB_CR0_NE(R3)
RET
-greater:
- MOVD $1,R3 // return value if A > B
+ PCALIGN $16
+cmp8: // 8 - 15B
+ CMP R9,$8
+ BLT cmp4
+ ANDCC $7,R9,R9
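+	// R9 = len-8, so the second pair of loads picks up the last 8 bytes of
+	// each operand, overlapping the first pair. The first compare decides
+	// unless equal, then the overlapped compare, then the length result
+	// already in R3.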
+ _LDBEX (R0)(R5),R10
+ _LDBEX (R0)(R6),R11
+ _LDBEX (R9)(R5),R12
+ _LDBEX (R9)(R6),R14
+ CMPU R10,R11,CR0
+ SETB_CR0(R5)
+ CMPU R12,R14,CR1
+ SETB_CR1(R6)
+ CRAND CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
+ ISEL CR0EQ,R6,R5,R4
+ ISEL CR1EQ,R3,R4,R3
RET
-cmp16:
- ANDCC $16,R9,R31
- BEQ tail
-
- LXVB16X (R0)(R5),V3 // load bytes of A at offset 16 into vector
- LXVB16X (R0)(R6),V4 // load bytes of B at offset 16 into vector
- VCMPEQUDCC V3,V4,V1
- BGE CR6,different
-
- ADD $16,R5
- ADD $16,R6
-tail:
- ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 32b)
- BEQ end
-
- ADD R9,R5
- ADD R9,R6
- MOVD $-16,R10
- LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
- LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
- VCMPEQUDCC V3,V4,V1
- BGE CR6,different
-end:
- BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
- BLT CR2,less // jump to less if BLT CR2 that is, len(A)<len(B)
- BR greater // jump to greater otherwise
-simplecheck:
- MOVD $0,R14 // process 8 bytes
- CMP R9,$8
- BLT word
-#ifdef GOARCH_ppc64le
- MOVDBR (R5+R14),R10
- MOVDBR (R6+R14),R11
-#else
- MOVD (R5+R14),R10
- MOVD (R6+R14),R11
-#endif
- CMPU R10,R11
- BGT greater
- BLT less
- ADD $8,R14
- ADD $-8,R9
PCALIGN $16
-word:
- CMP R9,$4 // process 4 bytes
- BLT halfword
-#ifdef GOARCH_ppc64le
- MOVWBR (R5+R14),R10
- MOVWBR (R6+R14),R11
-#else
- MOVWZ (R5+R14),R10
- MOVWZ (R6+R14),R11
-#endif
+cmp4: // 4 - 7B
+ CMP R9,$4
+ BLT cmp2
+ ANDCC $3,R9,R9
+ _LWBEX (R0)(R5),R10
+ _LWBEX (R0)(R6),R11
+ _LWBEX (R9)(R5),R12
+ _LWBEX (R9)(R6),R14
+ RLDIMI $32,R10,$0,R12
+ RLDIMI $32,R11,$0,R14
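+	// Pack the first word above the last word for each operand so that one
+	// unsigned 64-bit compare covers all 4-7 bytes, with earlier bytes more
+	// significant.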
+ CMPU R12,R14
+ BR cmp0
+
+ PCALIGN $16
+cmp2: // 2 - 3B
+ CMP R9,$2
+ BLT cmp1
+ ANDCC $1,R9,R9
+ _LHBEX (R0)(R5),R10
+ _LHBEX (R0)(R6),R11
+ _LHBEX (R9)(R5),R12
+ _LHBEX (R9)(R6),R14
+ RLDIMI $32,R10,$0,R12
+ RLDIMI $32,R11,$0,R14
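+	// Same trick as cmp4, using the first and last halfwords.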
+ CMPU R12,R14
+ BR cmp0
+
+ PCALIGN $16
+cmp1:
+ CMP R9,$0
+ BEQ cmp0
+ MOVBZ (R5),R10
+ MOVBZ (R6),R11
CMPU R10,R11
- BGT greater
- BLT less
- ADD $4,R14
- ADD $-4,R9
- PCALIGN $16
-halfword:
- CMP R9,$2 // process 2 bytes
- BLT byte
-#ifdef GOARCH_ppc64le
- MOVHBR (R5+R14),R10
- MOVHBR (R6+R14),R11
-#else
- MOVHZ (R5+R14),R10
- MOVHZ (R6+R14),R11
-#endif
- CMPU R10,R11
- BGT greater
- BLT less
- ADD $2,R14
- ADD $-2,R9
- PCALIGN $16
-byte:
- CMP R9,$0 // process 1 byte
- BEQ skip
- MOVBZ (R5+R14),R10
- MOVBZ (R6+R14),R11
- CMPU R10,R11
- BGT greater
- BLT less
- PCALIGN $16
-skip:
- BEQ CR2,equal
- BGT CR2,greater
-less:
- MOVD $-1,R3 // return value if A < B
- RET
-equal:
- MOVD $0, R3 // return value if A == B
+cmp0:
+	SETB_CR0(R6)        // R6 = -1/0/1 from the last compare (CR0)
+	ISEL CR0EQ,R3,R6,R3 // if it compared equal, return the length result in R3
RET