hash/crc32: use vector instructions on s390x

author Chris Zou <chriszou@ca.ibm.com>

Mon, 18 Apr 2016 23:30:17 +0000 (19:30 -0400)

committer Brad Fitzpatrick <bradfitz@golang.org>

Fri, 22 Apr 2016 18:07:15 +0000 (18:07 +0000)
author Chris Zou <chriszou@ca.ibm.com>
Mon, 18 Apr 2016 23:30:17 +0000 (19:30 -0400)
committer Brad Fitzpatrick <bradfitz@golang.org>
Fri, 22 Apr 2016 18:07:15 +0000 (18:07 +0000)
diff --git a/src/hash/crc32/crc32_generic.go b/src/hash/crc32/crc32_generic.go

index 62fa72028c30f3cc9266d4f3fb32bb070cfea6d0..10a6367bc9007c0c42a045eb6437cc6381c58f31 100644 (file)
--- a/src/hash/crc32/crc32_generic.go
+++ b/src/hash/crc32/crc32_generic.go
@@ -2,7 +2,7 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
-// +build !amd64,!amd64p32
+// +build !amd64,!amd64p32,!s390x
  
  package crc32
  
diff --git a/src/hash/crc32/crc32_s390x.go b/src/hash/crc32/crc32_s390x.go

new file mode 100644 (file)

index 0000000..2f20690
--- /dev/null
+++ b/src/hash/crc32/crc32_s390x.go
@@ -0,0 +1,101 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package crc32
+
+import (
+       "unsafe"
+)
+
+const (
+       vxMinLen    = 64
+       vxAlignment = 16
+       vxAlignMask = vxAlignment - 1
+)
+
+// hasVectorFacility reports whether the machine has the z/Architecture
+// vector facility installed and enabled.
+func hasVectorFacility() bool
+
+var hasVX = hasVectorFacility()
+
+// vectorizedCastagnoli implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+//go:noescape
+func vectorizedCastagnoli(crc uint32, p []byte) uint32
+
+// vectorizedIEEE implements CRC32 using vector instructions.
+// It is defined in crc32_s390x.s.
+//go:noescape
+func vectorizedIEEE(crc uint32, p []byte) uint32
+
+func genericCastagnoli(crc uint32, p []byte) uint32 {
+       // Use slicing-by-8 on larger inputs.
+       if len(p) >= sliceBy8Cutoff {
+               return updateSlicingBy8(crc, castagnoliTable8, p)
+       }
+       return update(crc, castagnoliTable, p)
+}
+
+func genericIEEE(crc uint32, p []byte) uint32 {
+       // Use slicing-by-8 on larger inputs.
+       if len(p) >= sliceBy8Cutoff {
+               ieeeTable8Once.Do(func() {
+                       ieeeTable8 = makeTable8(IEEE)
+               })
+               return updateSlicingBy8(crc, ieeeTable8, p)
+       }
+       return update(crc, IEEETable, p)
+}
+
+// updateCastagnoli calculates the checksum of p using genericCastagnoli to
+// align the data appropriately for vectorCastagnoli. It avoids using
+// vectorCastagnoli entirely if the length of p is less than or equal to
+// vxMinLen.
+func updateCastagnoli(crc uint32, p []byte) uint32 {
+       // Use vectorized function if vector facility is available and
+       // data length is above threshold.
+       if hasVX && len(p) > vxMinLen {
+               pAddr := uintptr(unsafe.Pointer(&p[0]))
+               if pAddr&vxAlignMask != 0 {
+                       prealign := vxAlignment - int(pAddr&vxAlignMask)
+                       crc = genericCastagnoli(crc, p[:prealign])
+                       p = p[prealign:]
+               }
+               aligned := len(p) & ^vxAlignMask
+               crc = vectorizedCastagnoli(crc, p[:aligned])
+               p = p[aligned:]
+               // process remaining data
+               if len(p) > 0 {
+                       crc = genericCastagnoli(crc, p)
+               }
+               return crc
+       }
+       return genericCastagnoli(crc, p)
+}
+
+// updateIEEE calculates the checksum of p using genericIEEE to align the data
+// appropriately for vectorIEEE. It avoids using vectorIEEE entirely if the length
+// of p is less than or equal to vxMinLen.
+func updateIEEE(crc uint32, p []byte) uint32 {
+       // Use vectorized function if vector facility is available and
+       // data length is above threshold.
+       if hasVX && len(p) > vxMinLen {
+               pAddr := uintptr(unsafe.Pointer(&p[0]))
+               if pAddr&vxAlignMask != 0 {
+                       prealign := vxAlignment - int(pAddr&vxAlignMask)
+                       crc = genericIEEE(crc, p[:prealign])
+                       p = p[prealign:]
+               }
+               aligned := len(p) & ^vxAlignMask
+               crc = vectorizedIEEE(crc, p[:aligned])
+               p = p[aligned:]
+               // process remaining data
+               if len(p) > 0 {
+                       crc = genericIEEE(crc, p)
+               }
+               return crc
+       }
+       return genericIEEE(crc, p)
+}
diff --git a/src/hash/crc32/crc32_s390x.s b/src/hash/crc32/crc32_s390x.s

new file mode 100644 (file)

index 0000000..f8d39f3
--- /dev/null
+++ b/src/hash/crc32/crc32_s390x.s
@@ -0,0 +1,245 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Vector register range containing CRC-32 constants
+
+#define CONST_PERM_LE2BE        V9
+#define CONST_R2R1              V10
+#define CONST_R4R3              V11
+#define CONST_R5                V12
+#define CONST_RU_POLY           V13
+#define CONST_CRC_POLY          V14
+
+
+// The CRC-32 constant block contains reduction constants to fold and
+// process particular chunks of the input data stream in parallel.
+//
+// Note that the constant definitions below are extended in order to compute
+// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+// The rightmost doubleword can be 0 to prevent contribution to the result or
+// can be multiplied by 1 to perform an XOR without the need for a separate
+// VECTOR EXCLUSIVE OR instruction.
+//
+// The polynomials used are bit-reflected:
+//
+//            IEEE: P'(x) = 0x0edb88320
+//      Castagnoli: P'(x) = 0x082f63b78
+
+
+// IEEE polynomial constants
+DATA    ·crclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908       // LE-to-BE mask
+DATA    ·crclecons+8(SB)/8,  $0x0706050403020100
+DATA    ·crclecons+16(SB)/8, $0x00000001c6e41596       // R2
+DATA    ·crclecons+24(SB)/8, $0x0000000154442bd4       // R1
+DATA    ·crclecons+32(SB)/8, $0x00000000ccaa009e       // R4
+DATA    ·crclecons+40(SB)/8, $0x00000001751997d0       // R3
+DATA    ·crclecons+48(SB)/8, $0x0000000000000000
+DATA    ·crclecons+56(SB)/8, $0x0000000163cd6124       // R5
+DATA    ·crclecons+64(SB)/8, $0x0000000000000000
+DATA    ·crclecons+72(SB)/8, $0x00000001F7011641       // u'
+DATA    ·crclecons+80(SB)/8, $0x0000000000000000
+DATA    ·crclecons+88(SB)/8, $0x00000001DB710641       // P'(x) << 1
+
+GLOBL    ·crclecons(SB),RODATA, $144
+
+// Castagonli Polynomial constants
+DATA    ·crcclecons+0(SB)/8,  $0x0F0E0D0C0B0A0908      // LE-to-BE mask
+DATA    ·crcclecons+8(SB)/8,  $0x0706050403020100
+DATA    ·crcclecons+16(SB)/8, $0x000000009e4addf8      // R2
+DATA    ·crcclecons+24(SB)/8, $0x00000000740eef02      // R1
+DATA    ·crcclecons+32(SB)/8, $0x000000014cd00bd6      // R4
+DATA    ·crcclecons+40(SB)/8, $0x00000000f20c0dfe      // R3
+DATA    ·crcclecons+48(SB)/8, $0x0000000000000000
+DATA    ·crcclecons+56(SB)/8, $0x00000000dd45aab8      // R5
+DATA    ·crcclecons+64(SB)/8, $0x0000000000000000
+DATA    ·crcclecons+72(SB)/8, $0x00000000dea713f1      // u'
+DATA    ·crcclecons+80(SB)/8, $0x0000000000000000
+DATA    ·crcclecons+88(SB)/8, $0x0000000105ec76f0      // P'(x) << 1
+
+GLOBL   ·crcclecons(SB),RODATA, $144
+
+// func hasVectorFacility() bool
+TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
+       MOVD    $x-24(SP), R1
+       XC      $24, 0(R1), 0(R1) // clear the storage
+       MOVD    $2, R0            // R0 is the number of double words stored -1
+       WORD    $0xB2B01000       // STFLE 0(R1)
+       XOR     R0, R0            // reset the value of R0
+       MOVBZ   z-8(SP), R1
+       AND     $0x40, R1
+       BEQ     novector
+vectorinstalled:
+       // check if the vector instruction has been enabled
+       VLEIB   $0, $0xF, V16
+       VLGVB   $0, V16, R1
+       CMPBNE  R1, $0xF, novector
+       MOVB    $1, ret+0(FP) // have vx
+       RET
+novector:
+       MOVB    $0, ret+0(FP) // no vx
+       RET
+
+
+// The CRC-32 function(s) use these calling conventions:
+//
+// Parameters:
+//
+//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
+//      R3:    Input buffer pointer, performance might be improved if the
+//             buffer is on a doubleword boundary.
+//      R4:    Length of the buffer, must be 64 bytes or greater.
+//
+// Register usage:
+//
+//      R5:     CRC-32 constant pool base pointer.
+//      V0:     Initial CRC value and intermediate constants and results.
+//      V1..V4: Data for CRC computation.
+//      V5..V8: Next data chunks that are fetched from the input buffer.
+//
+//      V9..V14: CRC-32 constants.
+
+// func vectorizedIEEE(crc uint32, p []byte) uint32
+TEXT ·vectorizedIEEE(SB),NOSPLIT,$0
+       MOVWZ   crc+0(FP), R2     // R2 stores the CRC value
+       MOVD    p+8(FP), R3       // data pointer
+       MOVD    p_len+16(FP), R4  // len(p)
+
+       MOVD    $·crclecons(SB), R5
+       BR      vectorizedBody<>(SB)
+
+// func vectorizedCastagnoli(crc uint32, p []byte) uint32
+TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0
+       MOVWZ   crc+0(FP), R2     // R2 stores the CRC value
+       MOVD    p+8(FP), R3       // data pointer
+       MOVD    p_len+16(FP), R4  // len(p)
+
+       // R5: crc-32 constant pool base pointer, constant is used to reduce crc
+       MOVD    $·crcclecons(SB), R5
+       BR      vectorizedBody<>(SB)
+
+TEXT vectorizedBody<>(SB),NOSPLIT,$0
+       XOR     $0xffffffff, R2 // NOTW R2
+       VLM     0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
+
+       // Load the initial CRC value into the rightmost word of V0
+       VZERO   V0
+       VLVGF   $3, R2, V0
+
+       // Load a 64-byte data chunk and XOR with CRC
+       VLM     0(R3), V1, V4    // 64-bytes into V1..V4
+
+       // Reflect the data if the CRC operation is in the bit-reflected domain
+       VPERM   V1, V1, CONST_PERM_LE2BE, V1
+       VPERM   V2, V2, CONST_PERM_LE2BE, V2
+       VPERM   V3, V3, CONST_PERM_LE2BE, V3
+       VPERM   V4, V4, CONST_PERM_LE2BE, V4
+
+       VX      V0, V1, V1     // V1 ^= CRC
+       ADD     $64, R3        // BUF = BUF + 64
+       ADD     $(-64), R4
+
+       // Check remaining buffer size and jump to proper folding method
+       CMP     R4, $64
+       BLT     less_than_64bytes
+
+fold_64bytes_loop:
+       // Load the next 64-byte data chunk into V5 to V8
+       VLM     0(R3), V5, V8
+       VPERM   V5, V5, CONST_PERM_LE2BE, V5
+       VPERM   V6, V6, CONST_PERM_LE2BE, V6
+       VPERM   V7, V7, CONST_PERM_LE2BE, V7
+       VPERM   V8, V8, CONST_PERM_LE2BE, V8
+
+
+       // Perform a GF(2) multiplication of the doublewords in V1 with
+       // the reduction constants in V0.  The intermediate result is
+       // then folded (accumulated) with the next data chunk in V5 and
+       // stored in V1.  Repeat this step for the register contents
+       // in V2, V3, and V4 respectively.
+
+       VGFMAG  CONST_R2R1, V1, V5, V1
+       VGFMAG  CONST_R2R1, V2, V6, V2
+       VGFMAG  CONST_R2R1, V3, V7, V3
+       VGFMAG  CONST_R2R1, V4, V8 ,V4
+
+       // Adjust buffer pointer and length for next loop
+       ADD     $64, R3                  // BUF = BUF + 64
+       ADD     $(-64), R4               // LEN = LEN - 64
+
+       CMP     R4, $64
+       BGE     fold_64bytes_loop
+
+less_than_64bytes:
+       // Fold V1 to V4 into a single 128-bit value in V1
+       VGFMAG  CONST_R4R3, V1, V2, V1
+       VGFMAG  CONST_R4R3, V1, V3, V1
+       VGFMAG  CONST_R4R3, V1, V4, V1
+
+       // Check whether to continue with 64-bit folding
+       CMP R4, $16
+       BLT final_fold
+
+fold_16bytes_loop:
+       VL      0(R3), V2               // Load next data chunk
+       VPERM   V2, V2, CONST_PERM_LE2BE, V2
+
+       VGFMAG  CONST_R4R3, V1, V2, V1  // Fold next data chunk
+
+       // Adjust buffer pointer and size for folding next data chunk
+       ADD     $16, R3
+       ADD     $-16, R4
+
+       // Process remaining data chunks
+       CMP     R4 ,$16
+       BGE     fold_16bytes_loop
+
+final_fold:
+       VLEIB   $7, $0x40, V9
+       VSRLB   V9, CONST_R4R3, V0
+       VLEIG   $0, $1, V0
+
+       VGFMG   V0, V1, V1
+
+       VLEIB   $7, $0x20, V9         // Shift by words
+       VSRLB   V9, V1, V2            // Store remaining bits in V2
+       VUPLLF  V1, V1                // Split rightmost doubleword
+       VGFMAG  CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2
+
+
+       // The input values to the Barret reduction are the degree-63 polynomial
+       // in V1 (R(x)), degree-32 generator polynomial, and the reduction
+       // constant u.  The Barret reduction result is the CRC value of R(x) mod
+       // P(x).
+       //
+       // The Barret reduction algorithm is defined as:
+       //
+       //    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+       //    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+       //    3. C(x)  = R(x) XOR T2(x) mod x^32
+       //
+       // Note: To compensate the division by x^32, use the vector unpack
+       // instruction to move the leftmost word into the leftmost doubleword
+       // of the vector register.  The rightmost doubleword is multiplied
+       // with zero to not contribute to the intermedate results.
+
+
+       // T1(x) = floor( R(x) / x^32 ) GF2MUL u
+       VUPLLF  V1, V2
+       VGFMG   CONST_RU_POLY, V2, V2
+
+
+       // Compute the GF(2) product of the CRC polynomial in VO with T1(x) in
+       // V2 and XOR the intermediate result, T2(x),  with the value in V1.
+       // The final result is in the rightmost word of V2.
+
+       VUPLLF  V2 , V2
+       VGFMAG  CONST_CRC_POLY, V2, V1, V2
+
+done:
+       VLGVF   $2, V2, R2
+       XOR     $0xffffffff, R2 // NOTW R2
+       MOVWZ   R2, ret + 32(FP)
+       RET
author	Chris Zou <chriszou@ca.ibm.com>
	Mon, 18 Apr 2016 23:30:17 +0000 (19:30 -0400)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Fri, 22 Apr 2016 18:07:15 +0000 (18:07 +0000)
src/hash/crc32/crc32_generic.go		patch \| blob \| history
src/hash/crc32/crc32_s390x.go	[new file with mode: 0644]	patch \| blob
src/hash/crc32/crc32_s390x.s	[new file with mode: 0644]	patch \| blob