hash/crc32: improve the processing of the last bytes in the SSE4.2 code for AMD64

author Radu Berinde <radu@cockroachlabs.com>

Tue, 16 Aug 2016 12:05:39 +0000 (08:05 -0400)

committer Keith Randall <khr@golang.org>

Wed, 17 Aug 2016 21:20:50 +0000 (21:20 +0000)
author Radu Berinde <radu@cockroachlabs.com>
Tue, 16 Aug 2016 12:05:39 +0000 (08:05 -0400)
committer Keith Randall <khr@golang.org>
Wed, 17 Aug 2016 21:20:50 +0000 (21:20 +0000)
diff --git a/src/hash/crc32/crc32_amd64.s b/src/hash/crc32/crc32_amd64.s

index caacfae21db68cfab09292b3f31b17e1ee45bb21..a775a194df8a2a0f095795d747a3f7bc2955dfe7 100644 (file)
--- a/src/hash/crc32/crc32_amd64.s
+++ b/src/hash/crc32/crc32_amd64.s
@@ -12,40 +12,79 @@ TEXT ·castagnoliSSE42(SB),NOSPLIT,$0
  
         NOTL AX
  
-       /* If there's less than 8 bytes to process, we do it byte-by-byte. */
+       // If there are fewer than 8 bytes to process, skip alignment.
         CMPQ CX, $8
-       JL cleanup
+       JL less_than_8
  
-       /* Process individual bytes until the input is 8-byte aligned. */
-startup:
         MOVQ SI, BX
         ANDQ $7, BX
         JZ aligned
  
+       // Process the first few bytes to 8-byte align the input.
+
+       // BX = 8 - BX. We need to process this many bytes to align.
+       SUBQ $1, BX
+       XORQ $7, BX
+
+       BTQ $0, BX
+       JNC align_2
+
         CRC32B (SI), AX
         DECQ CX
         INCQ SI
-       JMP startup
+
+align_2:
+       BTQ $1, BX
+       JNC align_4
+
+       // CRC32W (SI), AX
+       BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+
+       SUBQ $2, CX
+       ADDQ $2, SI
+
+align_4:
+       BTQ $2, BX
+       JNC aligned
+
+       // CRC32L (SI), AX
+       BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+
+       SUBQ $4, CX
+       ADDQ $4, SI
  
  aligned:
-       /* The input is now 8-byte aligned and we can process 8-byte chunks. */
+       // The input is now 8-byte aligned and we can process 8-byte chunks.
         CMPQ CX, $8
-       JL cleanup
+       JL less_than_8
  
         CRC32Q (SI), AX
         ADDQ $8, SI
         SUBQ $8, CX
         JMP aligned
  
-cleanup:
-       /* We may have some bytes left over that we process one at a time. */
-       CMPQ CX, $0
-       JE done
+less_than_8:
+       // We may have some bytes left over; process 4 bytes, then 2, then 1.
+       BTQ $2, CX
+       JNC less_than_4
+
+       // CRC32L (SI), AX
+       BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+       ADDQ $4, SI
+
+less_than_4:
+       BTQ $1, CX
+       JNC less_than_2
+
+       // CRC32W (SI), AX
+       BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+       ADDQ $2, SI
+
+less_than_2:
+       BTQ $0, CX
+       JNC done
  
         CRC32B (SI), AX
-       INCQ SI
-       DECQ CX
-       JMP cleanup
  
  done:
         NOTL AX
diff --git a/src/hash/crc32/crc32_test.go b/src/hash/crc32/crc32_test.go

index e2b3557828c2eef58544a14b9e964534f7349c2c..067c42adf05213c92bfdc88857bc0e32e47b8436 100644 (file)
--- a/src/hash/crc32/crc32_test.go
+++ b/src/hash/crc32/crc32_test.go
@@ -67,56 +67,68 @@ func TestGolden(t *testing.T) {
                         t.Errorf("Castagnoli(%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
                 }
  
-               if len(g.in) > 0 {
-                       // The SSE4.2 implementation of this has code to deal
-                       // with misaligned data so we ensure that we test that
-                       // too.
-                       castagnoli = New(castagnoliTab)
-                       io.WriteString(castagnoli, g.in[:1])
-                       io.WriteString(castagnoli, g.in[1:])
-                       s = castagnoli.Sum32()
-                       if s != g.castagnoli {
-                               t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
+               // The SSE4.2 implementation of this has code to deal
+               // with misaligned data so we ensure that we test that
+               // too.
+               for delta := 1; delta <= 7; delta++ {
+                       if len(g.in) > delta {
+                               in := []byte(g.in)
+                               castagnoli = New(castagnoliTab)
+                               castagnoli.Write(in[:delta])
+                               castagnoli.Write(in[delta:])
+                               s = castagnoli.Sum32()
+                               if s != g.castagnoli {
+                                       t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
+                               }
                         }
                 }
         }
  }
  
  func BenchmarkIEEECrc40B(b *testing.B) {
-       benchmark(b, NewIEEE(), 40)
+       benchmark(b, NewIEEE(), 40, 0)
  }
  
  func BenchmarkIEEECrc1KB(b *testing.B) {
-       benchmark(b, NewIEEE(), 1<<10)
+       benchmark(b, NewIEEE(), 1<<10, 0)
  }
  
  func BenchmarkIEEECrc4KB(b *testing.B) {
-       benchmark(b, NewIEEE(), 4<<10)
+       benchmark(b, NewIEEE(), 4<<10, 0)
  }
  
  func BenchmarkIEEECrc32KB(b *testing.B) {
-       benchmark(b, NewIEEE(), 32<<10)
+       benchmark(b, NewIEEE(), 32<<10, 0)
+}
+
+func BenchmarkCastagnoliCrc15B(b *testing.B) {
+       benchmark(b, New(MakeTable(Castagnoli)), 15, 0)
+}
+
+func BenchmarkCastagnoliCrc15BMisaligned(b *testing.B) {
+       benchmark(b, New(MakeTable(Castagnoli)), 15, 1)
  }
  
  func BenchmarkCastagnoliCrc40B(b *testing.B) {
-       benchmark(b, New(MakeTable(Castagnoli)), 40)
+       benchmark(b, New(MakeTable(Castagnoli)), 40, 0)
  }
  
  func BenchmarkCastagnoliCrc1KB(b *testing.B) {
-       benchmark(b, New(MakeTable(Castagnoli)), 1<<10)
+       benchmark(b, New(MakeTable(Castagnoli)), 1<<10, 0)
  }
  
  func BenchmarkCastagnoliCrc4KB(b *testing.B) {
-       benchmark(b, New(MakeTable(Castagnoli)), 4<<10)
+       benchmark(b, New(MakeTable(Castagnoli)), 4<<10, 0)
  }
  
  func BenchmarkCastagnoliCrc32KB(b *testing.B) {
-       benchmark(b, New(MakeTable(Castagnoli)), 32<<10)
+       benchmark(b, New(MakeTable(Castagnoli)), 32<<10, 0)
  }
  
-func benchmark(b *testing.B, h hash.Hash32, n int64) {
+func benchmark(b *testing.B, h hash.Hash32, n, alignment int64) {
         b.SetBytes(n)
-       data := make([]byte, n)
+       data := make([]byte, n+alignment)
+       data = data[alignment:]
         for i := range data {
                 data[i] = byte(i)
         }
author	Radu Berinde <radu@cockroachlabs.com>
	Tue, 16 Aug 2016 12:05:39 +0000 (08:05 -0400)
committer	Keith Randall <khr@golang.org>
	Wed, 17 Aug 2016 21:20:50 +0000 (21:20 +0000)
src/hash/crc32/crc32_amd64.s		patch | blob | history
src/hash/crc32/crc32_test.go		patch | blob | history