NOTL AX
- /* If there's less than 8 bytes to process, we do it byte-by-byte. */
+ // If there are fewer than 8 bytes to process, skip alignment.
CMPQ CX, $8
- JL cleanup
+ JL less_than_8
- /* Process individual bytes until the input is 8-byte aligned. */
-startup:
MOVQ SI, BX
ANDQ $7, BX
JZ aligned
+ // Process the first few bytes to 8-byte align the input.
+
+ // BX = 8 - BX. We need to process this many bytes to align.
+ SUBQ $1, BX
+ XORQ $7, BX
+
+ BTQ $0, BX
+ JNC align_2
+
CRC32B (SI), AX
DECQ CX
INCQ SI
- JMP startup
+
+align_2:
+ BTQ $1, BX
+ JNC align_4
+
+ // CRC32W (SI), AX
+ BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+
+ SUBQ $2, CX
+ ADDQ $2, SI
+
+align_4:
+ BTQ $2, BX
+ JNC aligned
+
+ // CRC32L (SI), AX
+ BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+
+ SUBQ $4, CX
+ ADDQ $4, SI
aligned:
- /* The input is now 8-byte aligned and we can process 8-byte chunks. */
+ // The input is now 8-byte aligned and we can process 8-byte chunks.
CMPQ CX, $8
- JL cleanup
+ JL less_than_8
CRC32Q (SI), AX
ADDQ $8, SI
SUBQ $8, CX
JMP aligned
-cleanup:
- /* We may have some bytes left over that we process one at a time. */
- CMPQ CX, $0
- JE done
+less_than_8:
+ // We may have some bytes left over; process 4 bytes, then 2, then 1.
+ BTQ $2, CX
+ JNC less_than_4
+
+ // CRC32L (SI), AX
+ BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+ ADDQ $4, SI
+
+less_than_4:
+ BTQ $1, CX
+ JNC less_than_2
+
+ // CRC32W (SI), AX
+ BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
+ ADDQ $2, SI
+
+less_than_2:
+ BTQ $0, CX
+ JNC done
CRC32B (SI), AX
- INCQ SI
- DECQ CX
- JMP cleanup
done:
NOTL AX
t.Errorf("Castagnoli(%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
}
- if len(g.in) > 0 {
- // The SSE4.2 implementation of this has code to deal
- // with misaligned data so we ensure that we test that
- // too.
- castagnoli = New(castagnoliTab)
- io.WriteString(castagnoli, g.in[:1])
- io.WriteString(castagnoli, g.in[1:])
- s = castagnoli.Sum32()
- if s != g.castagnoli {
- t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
+ // The SSE4.2 implementation of this has code to deal
+ // with misaligned data so we ensure that we test that
+ // too.
+ for delta := 1; delta <= 7; delta++ {
+ if len(g.in) > delta {
+ in := []byte(g.in)
+ castagnoli = New(castagnoliTab)
+ castagnoli.Write(in[:delta])
+ castagnoli.Write(in[delta:])
+ s = castagnoli.Sum32()
+ if s != g.castagnoli {
+ t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
+ }
}
}
}
}
func BenchmarkIEEECrc40B(b *testing.B) {
- benchmark(b, NewIEEE(), 40)
+ benchmark(b, NewIEEE(), 40, 0)
}
func BenchmarkIEEECrc1KB(b *testing.B) {
- benchmark(b, NewIEEE(), 1<<10)
+ benchmark(b, NewIEEE(), 1<<10, 0)
}
func BenchmarkIEEECrc4KB(b *testing.B) {
- benchmark(b, NewIEEE(), 4<<10)
+ benchmark(b, NewIEEE(), 4<<10, 0)
}
func BenchmarkIEEECrc32KB(b *testing.B) {
- benchmark(b, NewIEEE(), 32<<10)
+ benchmark(b, NewIEEE(), 32<<10, 0)
+}
+
+func BenchmarkCastagnoliCrc15B(b *testing.B) {
+ benchmark(b, New(MakeTable(Castagnoli)), 15, 0)
+}
+
+func BenchmarkCastagnoliCrc15BMisaligned(b *testing.B) {
+ benchmark(b, New(MakeTable(Castagnoli)), 15, 1)
}
func BenchmarkCastagnoliCrc40B(b *testing.B) {
- benchmark(b, New(MakeTable(Castagnoli)), 40)
+ benchmark(b, New(MakeTable(Castagnoli)), 40, 0)
}
func BenchmarkCastagnoliCrc1KB(b *testing.B) {
- benchmark(b, New(MakeTable(Castagnoli)), 1<<10)
+ benchmark(b, New(MakeTable(Castagnoli)), 1<<10, 0)
}
func BenchmarkCastagnoliCrc4KB(b *testing.B) {
- benchmark(b, New(MakeTable(Castagnoli)), 4<<10)
+ benchmark(b, New(MakeTable(Castagnoli)), 4<<10, 0)
}
func BenchmarkCastagnoliCrc32KB(b *testing.B) {
- benchmark(b, New(MakeTable(Castagnoli)), 32<<10)
+ benchmark(b, New(MakeTable(Castagnoli)), 32<<10, 0)
}
-func benchmark(b *testing.B, h hash.Hash32, n int64) {
+func benchmark(b *testing.B, h hash.Hash32, n, alignment int64) {
b.SetBytes(n)
- data := make([]byte, n)
+ data := make([]byte, n+alignment)
+ data = data[alignment:]
for i := range data {
data[i] = byte(i)
}