internal/bytealg: improve PPC64 equal

author Paul E. Murphy <murp@ibm.com>

Mon, 11 Apr 2022 20:21:03 +0000 (15:21 -0500)

committer Paul Murphy <murp@ibm.com>

Mon, 2 May 2022 20:18:15 +0000 (20:18 +0000)
author Paul E. Murphy <murp@ibm.com>
Mon, 11 Apr 2022 20:21:03 +0000 (15:21 -0500)
committer Paul Murphy <murp@ibm.com>
Mon, 2 May 2022 20:18:15 +0000 (20:18 +0000)
diff --git a/src/internal/bytealg/equal_ppc64x.s b/src/internal/bytealg/equal_ppc64x.s

index 8c9443d6fd9eabc51f6fa624320bc0d1e7ff5598..f2c7cc10f04fbc00a1c5e5227c90d14812012192 100644 (file)
--- a/src/internal/bytealg/equal_ppc64x.s
+++ b/src/internal/bytealg/equal_ppc64x.s
@@ -7,6 +7,21 @@
  #include "go_asm.h"
  #include "textflag.h"
  
+// 4K (smallest case) page size offset mask for PPC64.
+#define PAGE_OFFSET 4095
+
+// TODO: At the time of writing, ISEL and BC do not support CR bit type arguments;
+// define the bit indices (4*CR field + bit within the field) here for readability.
+#define CR0LT 4*0+0
+#define CR0EQ 4*0+2
+#define CR1LT 4*1+0
+#define CR6LT 4*6+0
+
+// Likewise, the BC opcode is hard to read, and no extended mnemonics are offered for
+// these forms: BGELR_CR6 returns via LR if CR6 LT is clear, BEQLR if CR0 EQ is set.
+#define BGELR_CR6 BC  4, CR6LT, (LR)
+#define BEQLR     BC 12, CR0EQ, (LR)
+
  // memequal(a, b unsafe.Pointer, size uintptr) bool
  TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
         // R3 = a
@@ -33,66 +48,158 @@ eq:
  // On exit:
  // R3 = return value
  TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
-       MOVD    R5,CTR
-       CMP     R5,$8           // only optimize >=8
-       BLT     simplecheck
-       DCBT    (R3)            // cache hint
-       DCBT    (R4)
-       CMP     R5,$32          // optimize >= 32
-       MOVD    R5,R6           // needed if setup8a branch
-       BLT     setup8a         // 8 byte moves only
-setup32a:                       // 8 byte aligned, >= 32 bytes
-       SRADCC  $5,R5,R6        // number of 32 byte chunks to compare
-       MOVD    R6,CTR
-       MOVD    $16,R14         // index for VSX loads and stores
-loop32a:
-       LXVD2X  (R3+R0), VS32   // VS32 = V0
-       LXVD2X  (R4+R0), VS33   // VS33 = V1
+       MOVD    R3, R8          // R8 = s1; R3 is reused below for the return value
+       ADD     R5, R3, R9      // R9 = &s1[len(s1)]
+       ADD     R5, R4, R10     // R10 = &s2[len(s2)]
+       MOVD    $1, R11         // R11 = 1, the "equal" value selected by ISEL
+       CMP     R5, $16         // Use GPR compares for len <= 16
+       BLE     check0_16
+       MOVD    $0, R3          // Assume no-match in case BGELR_CR6 returns early
+       CMP     R5, $32         // Use overlapping VSX loads for len <= 32
+       BLE     check17_32      // Do a pair of overlapping VSR compares
+       CMP     R5, $64
+       BLE     check33_64      // Hybrid check + overlap compare.
+
+setup64:
+       SRD     $6, R5, R6      // number of 64 byte chunks to compare
+       MOVD    R6, CTR         // loop64 iterates CTR times (see BDNZ)
+       MOVD    $16, R14        // index registers for the VSX loads: +16, +32, +48
+       MOVD    $32, R15
+       MOVD    $48, R16
+       ANDCC   $0x3F, R5, R5   // R5 = len%64; CR0 EQ is set if there is no tail
+
+       PCALIGN $32
+loop64:
+       LXVD2X  (R8+R0), V0
+       LXVD2X  (R4+R0), V1
         VCMPEQUBCC V0, V1, V2   // compare, setting CR6
-       BGE     CR6, noteq
-       LXVD2X  (R3+R14), VS32
-       LXVD2X  (R4+R14), VS33
-       VCMPEQUBCC V0, V1, V2
-       BGE     CR6, noteq
-       ADD     $32,R3          // bump up to next 32
-       ADD     $32,R4
-       BC      16, 0, loop32a  // br ctr and cr
-       ANDCC   $24,R5,R6       // Any 8 byte chunks?
-       BEQ     leftover        // and result is 0
-setup8a:
-       SRADCC  $3,R6,R6        // get the 8 byte count
-       BEQ     leftover        // shifted value is 0
-       MOVD    R6,CTR
-loop8:
-       MOVD    0(R3),R6        // doublewords to compare
-       ADD     $8,R3
-       MOVD    0(R4),R7
-       ADD     $8,R4
-       CMP     R6,R7           // match?
-       BC      8,2,loop8       // bt ctr <> 0 && cr
-       BNE     noteq
-leftover:
-       ANDCC   $7,R5,R6        // check for leftover bytes
-       BEQ     equal
-       MOVD    R6,CTR
-       BR      simple
-simplecheck:
-       CMP     R5,$0
-       BEQ     equal
-simple:
-       MOVBZ   0(R3), R6
-       ADD     $1,R3
-       MOVBZ   0(R4), R7
-       ADD     $1,R4
-       CMP     R6, R7
-       BNE     noteq
-       BC      8,2,simple
-       BNE     noteq
-       BR      equal
-noteq:
-       MOVD    $0, R3
+       BGELR_CR6               // mismatch: return with R3 still 0
+       LXVD2X  (R8+R14), V0
+       LXVD2X  (R4+R14), V1
+       VCMPEQUBCC      V0, V1, V2
+       BGELR_CR6
+       LXVD2X  (R8+R15), V0
+       LXVD2X  (R4+R15), V1
+       VCMPEQUBCC      V0, V1, V2
+       BGELR_CR6
+       LXVD2X  (R8+R16), V0
+       LXVD2X  (R4+R16), V1
+       VCMPEQUBCC      V0, V1, V2
+       BGELR_CR6
+       ADD     $64,R8          // bump up to next 64
+       ADD     $64,R4
+       BDNZ    loop64
+
+       ISEL    $CR0EQ, R11, R3, R3     // If no tail, return 1, otherwise R3 remains 0.
+       BEQLR                           // return if no tail (CR0 still holds the ANDCC result).
+
+       ADD     $-64, R9, R8            // R8 = &s1[len-64]: recheck the last 64 bytes, overlapping the loop
+       ADD     $-64, R10, R4           // R4 = &s2[len-64]
+       LXVD2X  (R8+R0), V0
+       LXVD2X  (R4+R0), V1
+       VCMPEQUBCC      V0, V1, V2
+       BGELR_CR6
+       LXVD2X  (R8+R14), V0
+       LXVD2X  (R4+R14), V1
+       VCMPEQUBCC      V0, V1, V2
+       BGELR_CR6
+       LXVD2X  (R8+R15), V0
+       LXVD2X  (R4+R15), V1
+       VCMPEQUBCC      V0, V1, V2
+       BGELR_CR6
+       LXVD2X  (R8+R16), V0
+       LXVD2X  (R4+R16), V1
+       VCMPEQUBCC      V0, V1, V2
+       ISEL    $CR6LT, R11, R0, R3     // R3 = 1 if the final 16 bytes match, else R0 (expected to hold 0)
         RET
-equal:
-       MOVD    $1, R3
+
+check33_64:
+       // Bytes 0-15
+       LXVD2X  (R8+R0), V0
+       LXVD2X  (R4+R0), V1
+       VCMPEQUBCC      V0, V1, V2
+       BGELR_CR6
+       ADD     $16, R8
+       ADD     $16, R4
+
+       // Bytes 16-31
+       LXVD2X  (R8+R0), V0
+       LXVD2X  (R4+R0), V1
+       VCMPEQUBCC      V0, V1, V2
+       BGELR_CR6
+
+       // A little tricky, but point R4,R8 to &sX[len-32],
+       // and reuse check17_32 to check the remaining 1-32 bytes (with some overlap)
+       ADD     $-32, R9, R8
+       ADD     $-32, R10, R4
+       // Fallthrough
+
+check17_32:
+       LXVD2X  (R8+R0), V0
+       LXVD2X  (R4+R0), V1
+       VCMPEQUBCC      V0, V1, V2
+       ISEL    $CR6LT, R11, R0, R5     // R5 = 1 if the first 16 bytes match, else R0
+
+       // Load sX[len(sX)-16:len(sX)] and compare.
+       ADD     $-16, R9
+       ADD     $-16, R10
+       LXVD2X  (R9+R0), V0
+       LXVD2X  (R10+R0), V1
+       VCMPEQUBCC      V0, V1, V2
+       ISEL    $CR6LT, R5, R0, R3      // R3 = R5 if the last 16 bytes match too, else R0
+       RET
+
+check0_16:
+       CMP     R5, $8
+       BLT     check0_7
+       // Load the first 8 bytes, sX[0:8], and compare.
+       MOVD    (R8), R6
+       MOVD    (R4), R7
+       CMP     R6, R7
+       ISEL    $CR0EQ, R11, R0, R5     // R5 = 1 if the first 8 bytes match, else R0
+       // Load sX[len(sX)-8:len(sX)] and compare.
+       MOVD    -8(R9), R6
+       MOVD    -8(R10), R7
+       CMP     R6, R7
+       ISEL    $CR0EQ, R5, R0, R3      // R3 = R5 if the last 8 bytes match too, else R0
         RET
  
+check0_7:
+       CMP     R5,$0
+       MOVD    $1, R3
+       BEQLR           // return if len == 0
+
+       // Check < 8B loads with a single compare, but select the load address
+       // such that it cannot cross a page boundary. Load a few bytes from the
+       // lower address if that does not cross the lower page. Or, load a few
+       // extra bytes from the higher addresses. And align those values
+       // consistently in register as either address may have differing
+       // alignment requirements.
+       ANDCC   $PAGE_OFFSET, R8, R6    // &sX & PAGE_OFFSET
+       ANDCC   $PAGE_OFFSET, R4, R9
+       SUBC    R5, $8, R12             // 8-len
+       SLD     $3, R12, R14            // (8-len)*8
+       CMPU    R6, R12, CR1            // Enough bytes lower in the page to load lower?
+       CMPU    R9, R12, CR0
+       SUB     R12, R8, R6             // compute lower load address
+       SUB     R12, R4, R9
+       ISEL    $CR1LT, R8, R6, R8      // R8 = (page offset of s1 < 8-len) ? &s1 : &s1 - (8-len)
+       ISEL    $CR0LT, R4, R9, R4      // Similar for s2
+       MOVD    (R8), R15
+       MOVD    (R4), R16
+       SLD     R14, R15, R7
+       SLD     R14, R16, R17
+       SRD     R14, R7, R7             // Clear the upper (8-len) bytes (with 2 shifts)
+       SRD     R14, R17, R17
+       SRD     R14, R15, R6            // Clear the lower (8-len) bytes
+       SRD     R14, R16, R9
+#ifdef GOARCH_ppc64le
+       ISEL    $CR1LT, R7, R6, R8      // Choose the correct len bytes to compare based on alignment
+       ISEL    $CR0LT, R17, R9, R4
+#else
+       ISEL    $CR1LT, R6, R7, R8      // Operands are swapped relative to the ppc64le case
+       ISEL    $CR0LT, R9, R17, R4
+#endif
+       CMP     R4, R8
+       ISEL    $CR0EQ, R11, R0, R3     // R3 = 1 if the selected len bytes are equal, else R0
+       RET
author	Paul E. Murphy <murp@ibm.com>
	Mon, 11 Apr 2022 20:21:03 +0000 (15:21 -0500)
committer	Paul Murphy <murp@ibm.com>
	Mon, 2 May 2022 20:18:15 +0000 (20:18 +0000)