#include "go_asm.h"
#include "textflag.h"
+// 4K (smallest case) page size offset mask for PPC64.
+#define PAGE_OFFSET 4095
+
+// TODO: At writing, ISEL and BC do not support CR bit type arguments,
+// define them here for readability.
+#define CR0LT 4*0+0
+#define CR0EQ 4*0+2
+#define CR1LT 4*1+0
+#define CR6LT 4*6+0
+
+// Likewise, the BC opcode is hard to read, and no extended
+// mnemonics are offered for these forms.
+#define BGELR_CR6 BC 4, CR6LT, (LR)
+#define BEQLR BC 12, CR0EQ, (LR)
+
// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
// R3 = a
// On exit:
// R3 = return value
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
- MOVD R5,CTR
- CMP R5,$8 // only optimize >=8
- BLT simplecheck
- DCBT (R3) // cache hint
- DCBT (R4)
- CMP R5,$32 // optimize >= 32
- MOVD R5,R6 // needed if setup8a branch
- BLT setup8a // 8 byte moves only
-setup32a: // 8 byte aligned, >= 32 bytes
- SRADCC $5,R5,R6 // number of 32 byte chunks to compare
- MOVD R6,CTR
- MOVD $16,R14 // index for VSX loads and stores
-loop32a:
- LXVD2X (R3+R0), VS32 // VS32 = V0
- LXVD2X (R4+R0), VS33 // VS33 = V1
+ MOVD R3, R8 // Move s1 into R8
+ ADD R5, R3, R9 // &s1[len(s1)]
+ ADD R5, R4, R10 // &s2[len(s2)]
+ MOVD $1, R11
+ CMP R5, $16 // Use GPR checks for check for len <= 16
+ BLE check0_16
+ MOVD $0, R3 // Assume no-match in case BGELR CR6 returns
+ CMP R5, $32 // Use overlapping VSX loads for len <= 32
+ BLE check17_32 // Do a pair of overlapping VSR compares
+ CMP R5, $64
+ BLE check33_64 // Hybrid check + overlap compare.
+
+setup64:
+ SRD $6, R5, R6 // number of 64 byte chunks to compare
+ MOVD R6, CTR
+ MOVD $16, R14 // index for VSX loads and stores
+ MOVD $32, R15
+ MOVD $48, R16
+ ANDCC $0x3F, R5, R5 // len%64==0?
+
+ PCALIGN $32
+loop64:
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
VCMPEQUBCC V0, V1, V2 // compare, setting CR6
- BGE CR6, noteq
- LXVD2X (R3+R14), VS32
- LXVD2X (R4+R14), VS33
- VCMPEQUBCC V0, V1, V2
- BGE CR6, noteq
- ADD $32,R3 // bump up to next 32
- ADD $32,R4
- BC 16, 0, loop32a // br ctr and cr
- ANDCC $24,R5,R6 // Any 8 byte chunks?
- BEQ leftover // and result is 0
-setup8a:
- SRADCC $3,R6,R6 // get the 8 byte count
- BEQ leftover // shifted value is 0
- MOVD R6,CTR
-loop8:
- MOVD 0(R3),R6 // doublewords to compare
- ADD $8,R3
- MOVD 0(R4),R7
- ADD $8,R4
- CMP R6,R7 // match?
- BC 8,2,loop8 // bt ctr <> 0 && cr
- BNE noteq
-leftover:
- ANDCC $7,R5,R6 // check for leftover bytes
- BEQ equal
- MOVD R6,CTR
- BR simple
-simplecheck:
- CMP R5,$0
- BEQ equal
-simple:
- MOVBZ 0(R3), R6
- ADD $1,R3
- MOVBZ 0(R4), R7
- ADD $1,R4
- CMP R6, R7
- BNE noteq
- BC 8,2,simple
- BNE noteq
- BR equal
-noteq:
- MOVD $0, R3
+ BGELR_CR6
+ LXVD2X (R8+R14), V0
+ LXVD2X (R4+R14), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R15), V0
+ LXVD2X (R4+R15), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R16), V0
+ LXVD2X (R4+R16), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ ADD $64,R8 // bump up to next 64
+ ADD $64,R4
+ BDNZ loop64
+
+ ISEL $CR0EQ, R11, R3, R3 // If no tail, return 1, otherwise R3 remains 0.
+ BEQLR // return if no tail.
+
+ ADD $-64, R9, R8
+ ADD $-64, R10, R4
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R14), V0
+ LXVD2X (R4+R14), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R15), V0
+ LXVD2X (R4+R15), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ LXVD2X (R8+R16), V0
+ LXVD2X (R4+R16), V1
+ VCMPEQUBCC V0, V1, V2
+ ISEL $CR6LT, R11, R0, R3
RET
-equal:
- MOVD $1, R3
+
+check33_64:
+ // Bytes 0-15
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+ ADD $16, R8
+ ADD $16, R4
+
+ // Bytes 16-31
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ BGELR_CR6
+
+ // A little tricky, but point R4,R8 to &sx[len-32],
+ // and reuse check17_32 to check the next 1-31 bytes (with some overlap)
+ ADD $-32, R9, R8
+ ADD $-32, R10, R4
+ // Fallthrough
+
+check17_32:
+ LXVD2X (R8+R0), V0
+ LXVD2X (R4+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ ISEL $CR6LT, R11, R0, R5
+
+ // Load sX[len(sX)-16:len(sX)] and compare.
+ ADD $-16, R9
+ ADD $-16, R10
+ LXVD2X (R9+R0), V0
+ LXVD2X (R10+R0), V1
+ VCMPEQUBCC V0, V1, V2
+ ISEL $CR6LT, R5, R0, R3
+ RET
+
+check0_16:
+ CMP R5, $8
+ BLT check0_7
+ // Load sX[0:7] and compare.
+ MOVD (R8), R6
+ MOVD (R4), R7
+ CMP R6, R7
+ ISEL $CR0EQ, R11, R0, R5
+ // Load sX[len(sX)-8:len(sX)] and compare.
+ MOVD -8(R9), R6
+ MOVD -8(R10), R7
+ CMP R6, R7
+ ISEL $CR0EQ, R5, R0, R3
RET
+check0_7:
+ CMP R5,$0
+ MOVD $1, R3
+ BEQLR // return if len == 0
+
+ // Check < 8B loads with a single compare, but select the load address
+ // such that it cannot cross a page boundary. Load a few bytes from the
+ // lower address if that does not cross the lower page. Or, load a few
+ // extra bytes from the higher addresses. And align those values
+ // consistently in register as either address may have differing
+ // alignment requirements.
+ ANDCC $PAGE_OFFSET, R8, R6 // &sX & PAGE_OFFSET
+ ANDCC $PAGE_OFFSET, R4, R9
+ SUBC R5, $8, R12 // 8-len
+ SLD $3, R12, R14 // (8-len)*8
+ CMPU R6, R12, CR1 // Enough bytes lower in the page to load lower?
+ CMPU R9, R12, CR0
+ SUB R12, R8, R6 // compute lower load address
+ SUB R12, R4, R9
+ ISEL $CR1LT, R8, R6, R8 // R8 = R6 < 0 ? R8 (&s1) : R6 (&s1 - (8-len))
+ ISEL $CR0LT, R4, R9, R4 // Similar for s2
+ MOVD (R8), R15
+ MOVD (R4), R16
+ SLD R14, R15, R7
+ SLD R14, R16, R17
+ SRD R14, R7, R7 // Clear the upper (8-len) bytes (with 2 shifts)
+ SRD R14, R17, R17
+ SRD R14, R15, R6 // Clear the lower (8-len) bytes
+ SRD R14, R16, R9
+#ifdef GOARCH_ppc64le
+ ISEL $CR1LT, R7, R6, R8 // Choose the correct len bytes to compare based on alignment
+ ISEL $CR0LT, R17, R9, R4
+#else
+ ISEL $CR1LT, R6, R7, R8
+ ISEL $CR0LT, R9, R17, R4
+#endif
+ CMP R4, R8
+ ISEL $CR0EQ, R11, R0, R3
+ RET