runtime: improve IndexByte for ppc64x

author Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>

Mon, 7 Aug 2017 18:44:38 +0000 (15:44 -0300)

committer Lynn Boger <laboger@linux.vnet.ibm.com>

Mon, 6 Nov 2017 21:56:18 +0000 (21:56 +0000)
author Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Mon, 7 Aug 2017 18:44:38 +0000 (15:44 -0300)
committer Lynn Boger <laboger@linux.vnet.ibm.com>
Mon, 6 Nov 2017 21:56:18 +0000 (21:56 +0000)
diff --git a/src/cmd/asm/internal/asm/testdata/ppc64.s b/src/cmd/asm/internal/asm/testdata/ppc64.s

index a12a4b55d5fe61a9de2acc88600061ba582cb545..2909c390945a3a9d4de2690bdf9e4d8c8f9774fd 100644 (file)
--- a/src/cmd/asm/internal/asm/testdata/ppc64.s
+++ b/src/cmd/asm/internal/asm/testdata/ppc64.s
@@ -932,6 +932,12 @@ label1:
  //     <mnemonic> VRT,VRA,VRB,VRC
         VPERM V3, V2, V1, V0
  
+//     Vector bit permute, VX-form
+//     <MNEMONIC> VRA,VRB,VRT produces
+//     <mnemonic> VRT,VRA,VRB
+       VBPERMQ V3,V1,V2
+       VBPERMD V3,V1,V2
+
  //     Vector select, VA-form
  //     <MNEMONIC> VRA,VRB,VRC,VRT produces
  //     <mnemonic> VRT,VRA,VRB,VRC
diff --git a/src/cmd/internal/obj/ppc64/a.out.go b/src/cmd/internal/obj/ppc64/a.out.go

index 6b5a1b4351d886ffcc2359793580f2848c7f8ef2..e684281774e6a57b5a2f253573dc333b8dc721ae 100644 (file)
--- a/src/cmd/internal/obj/ppc64/a.out.go
+++ b/src/cmd/internal/obj/ppc64/a.out.go
@@ -859,6 +859,8 @@ const (
         AVCMPNEZB
         AVCMPNEZBCC
         AVPERM
+       AVBPERMQ
+       AVBPERMD
         AVSEL
         AVSPLT
         AVSPLTB
diff --git a/src/cmd/internal/obj/ppc64/anames.go b/src/cmd/internal/obj/ppc64/anames.go

index 142b53eaddb72251013281da73cf0668ed639c52..b7ca1330573a7254a211958465b588674514c1ba 100644 (file)
--- a/src/cmd/internal/obj/ppc64/anames.go
+++ b/src/cmd/internal/obj/ppc64/anames.go
@@ -474,6 +474,8 @@ var Anames = []string{
         "VCMPNEZB",
         "VCMPNEZBCC",
         "VPERM",
+       "VBPERMQ",
+       "VBPERMD",
         "VSEL",
         "VSPLT",
         "VSPLTB",
diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go

index 2b8efe846d5902fced8ae25fa281059d6f6630cf..1f488d5e4d2bf1d7d09dcdc54b989d63394acad3 100644 (file)
--- a/src/cmd/internal/obj/ppc64/asm9.go
+++ b/src/cmd/internal/obj/ppc64/asm9.go
@@ -421,6 +421,9 @@ var optab = []Optab{
         /* Vector permute */
         {AVPERM, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector permute, va-form */
  
+       /* Vector bit permute */
+       {AVBPERMQ, C_VREG, C_VREG, C_NONE, C_VREG, 82, 4, 0}, /* vector bit permute, vx-form */
+
         /* Vector select */
         {AVSEL, C_VREG, C_VREG, C_VREG, C_VREG, 83, 4, 0}, /* vector select, va-form */
  
@@ -1378,6 +1381,9 @@ func buildop(ctxt *obj.Link) {
                 case AVPERM: /* vperm */
                         opset(AVPERM, r0)
  
+               case AVBPERMQ: /* vbpermq, vbpermd */
+                       opset(AVBPERMD, r0)
+
                 case AVSEL: /* vsel */
                         opset(AVSEL, r0)
  
@@ -4165,6 +4171,11 @@ func (c *ctxt9) oprrr(a obj.As) uint32 {
         case AVSRAD:
                 return OPVX(4, 964, 0, 0) /* vsrad - v2.07 */
  
+       case AVBPERMQ:
+               return OPVC(4, 1356, 0, 0) /* vbpermq - v2.07 */
+       case AVBPERMD:
+               return OPVC(4, 1484, 0, 0) /* vbpermd - v3.00 */
+
         case AVCLZB:
                 return OPVX(4, 1794, 0, 0) /* vclzb - v2.07 */
         case AVCLZH:
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s

index 487187f4d88674c25b68697eb477d0df13a78752..e02ca169071d4495fc127f87d2b98f40dcf5c71e 100644 (file)
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -1084,24 +1084,17 @@ TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
  
  TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
         DCBT    (R3)            // Prepare cache line.
-       MOVD    R3,R10          // Save base address for calculating the index later.
+       MOVD    R3,R17          // Save base address for calculating the index later.
         RLDICR  $0,R3,$60,R8    // Align address to doubleword boundary in R8.
         RLDIMI  $8,R5,$48,R5    // Replicating the byte across the register.
-
-       // Calculate last acceptable address and check for possible overflow
-       // using a saturated add.
-       // Overflows set last acceptable address to 0xffffffffffffffff.
-       ADD     R4,R3,R7
-       SUBC    R3,R7,R6
-       SUBE    R0,R0,R9
-       MOVW    R9,R6
-       OR      R6,R7,R7
+       ADD     R4,R3,R7        // Last acceptable address in R7.
  
         RLDIMI  $16,R5,$32,R5
         CMPU    R4,$32          // Check if it's a small string (<32 bytes). Those will be processed differently.
         MOVD    $-1,R9
-       WORD $0x54661EB8        // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+       WORD    $0x54661EB8     // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
         RLDIMI  $32,R5,$0,R5
+       MOVD    R7,R10          // Save last acceptable address in R10 for later.
         ADD     $-1,R7,R7
  #ifdef GOARCH_ppc64le
         SLD     R6,R9,R9        // Prepare mask for Little Endian
@@ -1110,56 +1103,142 @@ TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
  #endif
         BLE     small_string    // Jump to the small string case if it's <32 bytes.
  
-       // Case for length >32 bytes
+       // If we are 64-byte aligned, branch to qw_align just to get the auxiliary values
+       // in V0, V1 and V10, then branch to the preloop.
+       ANDCC   $63,R3,R11
+       BEQ     CR0,qw_align
+       RLDICL  $0,R3,$61,R11
+
         MOVD    0(R8),R12       // Load one doubleword from the aligned address in R8.
         CMPB    R12,R5,R3       // Check for a match.
         AND     R9,R3,R3        // Mask bytes below s_base
-       RLDICL  $0,R7,$61,R4    // length-1
+       RLDICL  $0,R7,$61,R6    // length-1
         RLDICR  $0,R7,$60,R7    // Last doubleword in R7
         CMPU    R3,$0,CR7       // If we have a match, jump to the final computation
         BNE     CR7,done
+       ADD     $8,R8,R8
+       ADD     $-8,R4,R4
+       ADD     R4,R11,R4
  
-       // Check for doubleword alignment and jump to the loop setup if aligned.
-       MOVFL   R8,CR7
-       BC      12,28,loop_setup
+       // Check for quadword alignment
+       ANDCC   $15,R8,R11
+       BEQ     CR0,qw_align
  
-       // Not aligned, so handle the second doubleword
-       MOVDU   8(R8),R12
+       // Not aligned, so handle the next doubleword
+       MOVD    0(R8),R12
         CMPB    R12,R5,R3
         CMPU    R3,$0,CR7
         BNE     CR7,done
+       ADD     $8,R8,R8
+       ADD     $-8,R4,R4
+
+       // Either quadword aligned or 64-byte aligned at this point. We can use LVX.
+qw_align:
+
+       // Set up auxiliary data for the vectorized algorithm.
+       VSPLTISB  $0,V0         // Replicate 0 across V0
+       VSPLTISB  $3,V10        // Use V10 as control for VBPERMQ
+       MTVRD     R5,V1
+       LVSL      (R0+R0),V11
+       VSLB      V11,V10,V10
+       VSPLTB    $7,V1,V1      // Replicate byte across V1
+       CMPU      R4, $64       // If len <= 64, don't use the vectorized loop
+       BLE       tail
+
+       // We will load 4 quadwords per iteration in the loop, so check for
+       // 64-byte alignment. If 64-byte aligned, then branch to the preloop.
+       ANDCC     $63,R8,R11
+       BEQ       CR0,preloop
+
+       // Not 64-byte aligned. Load one quadword at a time until aligned.
+       LVX         (R8+R0),V4
+       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
+       BNE         CR6,found_qw_align
+       ADD         $16,R8,R8
+       ADD         $-16,R4,R4
+
+       ANDCC       $63,R8,R11
+       BEQ         CR0,preloop
+       LVX         (R8+R0),V4
+       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
+       BNE         CR6,found_qw_align
+       ADD         $16,R8,R8
+       ADD         $-16,R4,R4
+
+       ANDCC       $63,R8,R11
+       BEQ         CR0,preloop
+       LVX         (R8+R0),V4
+       VCMPEQUBCC  V1,V4,V6            // Check for byte in V4
+       BNE         CR6,found_qw_align
+       ADD         $-16,R4,R4
+       ADD         $16,R8,R8
+
+       // 64-byte aligned. Prepare for the main loop.
+preloop:
+       CMPU    R4,$64
+       BLE     tail          // If len <= 64, don't use the vectorized loop
+
+       // We are now aligned to a 64-byte boundary. We will load 4 quadwords
+       // per loop iteration. The last acceptable address is in R10, so our loop counter
+       // starts at (R10-R8)/64.
+       SUB     R8,R10,R6
+       SRD     $6,R6,R9      // Loop counter in R9
+       MOVD    R9,CTR
  
-loop_setup:
-       // We are now aligned to a 16-byte boundary. We will load two doublewords
-       // per loop iteration. The last doubleword is in R7, so our loop counter
-       // starts at (R7-R8)/16.
-       SUB     R8,R7,R6
-       SRD     $4,R6,R6
-       MOVD    R6,CTR
+       MOVD    $16,R11      // Load offsets for the vector loads
+       MOVD    $32,R9
+       MOVD    $48,R7
  
-       // Note: when we have an align directive, align this loop to 32 bytes so
-       // it fits in a single icache sector.
+       // Main loop: we load 64 bytes per iteration.
  loop:
-       // Load two doublewords, then compare and merge in a single register. We
-       // will check two doublewords per iteration, then find out which of them
-       // contains the byte later. This speeds up the search.
-       MOVD    8(R8),R12
-       MOVDU   16(R8),R11
-       CMPB    R12,R5,R3
-       CMPB    R11,R5,R9
-       OR      R3,R9,R6
-       CMPU    R6,$0,CR7
-       BNE     CR7,found
-       BC      16,0,loop
-
-       // Counter zeroed, but we may have another doubleword to read
-       CMPU    R8,R7
-       BEQ     notfound
-
-       MOVDU   8(R8),R12
-       CMPB    R12,R5,R3
-       CMPU    R3,$0,CR6
-       BNE     CR6,done
+       LVX         (R8+R0),V2        // Load 4 16-byte vectors
+       LVX         (R11+R8),V3
+       LVX         (R9+R8),V4
+       LVX         (R7+R8),V5
+       VCMPEQUB    V1,V2,V6          // Look for byte in each vector
+       VCMPEQUB    V1,V3,V7
+       VCMPEQUB    V1,V4,V8
+       VCMPEQUB    V1,V5,V9
+       VOR         V6,V7,V11         // Compress the result in a single vector
+       VOR         V8,V9,V12
+       VOR         V11,V12,V11
+       VCMPEQUBCC  V0,V11,V11        // Check for byte
+       BGE         CR6,found
+       ADD         $64,R8,R8
+       BC          16,0,loop         // bdnz loop
+
+       // Handle the trailing bytes or R4 <= 64
+       RLDICL  $0,R6,$58,R4
+tail:
+       CMPU        R4,$0
+       BEQ         notfound
+       LVX         (R8+R0),V4
+       VCMPEQUBCC  V1,V4,V6
+       BNE         CR6,found_qw_align
+       ADD         $16,R8,R8
+       CMPU        R4,$16,CR6
+       BLE         CR6,notfound
+       ADD         $-16,R4,R4
+
+       LVX         (R8+R0),V4
+       VCMPEQUBCC  V1,V4,V6
+       BNE         CR6,found_qw_align
+       ADD         $16,R8,R8
+       CMPU        R4,$16,CR6
+       BLE         CR6,notfound
+       ADD         $-16,R4,R4
+
+       LVX         (R8+R0),V4
+       VCMPEQUBCC  V1,V4,V6
+       BNE         CR6,found_qw_align
+       ADD         $16,R8,R8
+       CMPU        R4,$16,CR6
+       BLE         CR6,notfound
+       ADD         $-16,R4,R4
+
+       LVX         (R8+R0),V4
+       VCMPEQUBCC  V1,V4,V6
+       BNE         CR6,found_qw_align
  
  notfound:
         MOVD    $-1,R3
@@ -1167,15 +1246,68 @@ notfound:
         RET
  
  found:
-       // One of the doublewords from the loop contains the byte we are looking
-       // for. Check the first doubleword and adjust the address if found.
-       CMPU    R3,$0,CR6
-       ADD     $-8,R8,R8
-       BNE     CR6,done
+       // We will now compress the results into a single doubleword,
+       // so it can be moved to a GPR for the final index calculation.
+
+       // The bytes in V6-V9 are either 0x00 or 0xFF. So, permute the
+       // first bit of each byte into bits 48-63.
+       VBPERMQ   V6,V10,V6
+       VBPERMQ   V7,V10,V7
+       VBPERMQ   V8,V10,V8
+       VBPERMQ   V9,V10,V9
+
+       // Shift each 16-bit component into its correct position for
+       // merging into a single doubleword.
+#ifdef GOARCH_ppc64le
+       VSLDOI    $2,V7,V7,V7
+       VSLDOI    $4,V8,V8,V8
+       VSLDOI    $6,V9,V9,V9
+#else
+       VSLDOI    $6,V6,V6,V6
+       VSLDOI    $4,V7,V7,V7
+       VSLDOI    $2,V8,V8,V8
+#endif
  
-       // Not found, so it must be in the second doubleword of the merged pair.
-       MOVD    R9,R3
-       ADD     $8,R8,R8
+       // Merge V6-V9 into a single doubleword and move to a GPR.
+       VOR     V6,V7,V11
+       VOR     V8,V9,V4
+       VOR     V4,V11,V4
+       MFVRD   V4,R3
+
+#ifdef GOARCH_ppc64le
+       ADD       $-1,R3,R11
+       ANDN      R3,R11,R11
+       POPCNTD   R11,R11       // Count trailing zeros (Little Endian).
+#else
+       CNTLZD  R3,R11          // Count leading zeros (Big Endian).
+#endif
+       ADD     R8,R11,R3       // Calculate byte address
+
+return:
+       SUB     R17,R3
+       MOVD    R3,(R14)
+       RET
+
+found_qw_align:
+       // Use the same algorithm as above. Compress the result into
+       // a single doubleword and move it to a GPR for the final
+       // calculation.
+       VBPERMQ   V6,V10,V6
+
+#ifdef GOARCH_ppc64le
+       MFVRD     V6,R3
+       ADD       $-1,R3,R11
+       ANDN      R3,R11,R11
+       POPCNTD   R11,R11
+#else
+       VSLDOI    $6,V6,V6,V6
+       MFVRD     V6,R3
+       CNTLZD    R3,R11
+#endif
+       ADD       R8,R11,R3
+       CMPU      R11,R4
+       BLT       return
+       BR        notfound
  
  done:
         // At this point, R3 has 0xFF in the same position as the byte we are
@@ -1191,17 +1323,10 @@ done:
         CMPU    R8,R7           // Check if we are at the last doubleword.
         SRD     $3,R11          // Convert trailing zeros to bytes.
         ADD     R11,R8,R3
-       CMPU    R11,R4,CR7      // If at the last doubleword, check the byte offset.
+       CMPU    R11,R6,CR7      // If at the last doubleword, check the byte offset.
         BNE     return
         BLE     CR7,return
-       MOVD    $-1,R3
-       MOVD    R3,(R14)
-       RET
-
-return:
-       SUB     R10,R3          // Calculate index.
-       MOVD    R3,(R14)
-       RET
+       BR      notfound
  
  small_string:
         // We unroll this loop for better performance.
@@ -1212,9 +1337,9 @@ small_string:
         CMPB    R12,R5,R3       // Check for a match.
         AND     R9,R3,R3        // Mask bytes below s_base.
         CMPU    R3,$0,CR7       // If we have a match, jump to the final computation.
-       RLDICL  $0,R7,$61,R4    // length-1
+       RLDICL  $0,R7,$61,R6    // length-1
         RLDICR  $0,R7,$60,R7    // Last doubleword in R7.
-        CMPU   R8,R7
+       CMPU    R8,R7
         BNE     CR7,done
         BEQ     notfound        // Hit length.
  
@@ -1242,7 +1367,6 @@ small_string:
         MOVDU   8(R8),R12
         CMPB    R12,R5,R3
         CMPU    R3,$0,CR6
-       CMPU    R8,R7
         BNE     CR6,done
         BR      notfound
author	Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
	Mon, 7 Aug 2017 18:44:38 +0000 (15:44 -0300)
committer	Lynn Boger <laboger@linux.vnet.ibm.com>
	Mon, 6 Nov 2017 21:56:18 +0000 (21:56 +0000)
src/cmd/asm/internal/asm/testdata/ppc64.s		patch \| blob \| history
src/cmd/internal/obj/ppc64/a.out.go		patch \| blob \| history
src/cmd/internal/obj/ppc64/anames.go		patch \| blob \| history
src/cmd/internal/obj/ppc64/asm9.go		patch \| blob \| history
src/runtime/asm_ppc64x.s		patch \| blob \| history