AMOVNTHD
AMOVHDA
AVPCMPEQB
+ AVPXOR
AVPMOVMSKB
AVPAND
AVPTEST
"MOVNTHD",
"MOVHDA",
"VPCMPEQB",
+ "VPXOR",
"VPMOVMSKB",
"VPAND",
"VPTEST",
{AMOVNTHD, yxr_ml_vex, Pvex1, [23]uint8{0xe7}},
{AMOVHDA, yxmov_vex, Pvex1, [23]uint8{0x6f, 0x7f}},
{AVPCMPEQB, yxm_xm_xm, Pvex1, [23]uint8{0x74, 0x74}},
+ {AVPXOR, yxm_xm_xm, Pvex1, [23]uint8{0xef, 0xef}},
{AVPMOVMSKB, ymskb_vex, Pvex1, [23]uint8{0xd7}},
{AVPAND, yxm_xm_xm, Pvex1, [23]uint8{0xdb, 0xdb}},
{AVPBROADCASTB, yml_xr_vex, Pvex3, [23]uint8{0x78, 0x78}},
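Each new row above follows the existing optab pattern: opcode constant, operand-shape table (ytab), VEX prefix class, and the raw opcode bytes. As a rough illustration of how such a table is consulted (toy types and string keys, not the real asm6.go machinery, which indexes by opcode constant):

package main

import "fmt"

// optabRow is a simplified stand-in for the assembler's optab entries:
// mnemonic, operand-shape class, prefix class, and opcode bytes.
type optabRow struct {
	as, ytab, prefix string
	op               []byte
}

var optab = []optabRow{
	{"VPXOR", "yxm_xm_xm", "Pvex1", []byte{0xef, 0xef}},
	{"MOVNTHD", "yxr_ml_vex", "Pvex1", []byte{0xe7}},
}

// lookup finds the encoding row for a mnemonic.
func lookup(as string) *optabRow {
	for i := range optab {
		if optab[i].as == as {
			return &optab[i]
		}
	}
	return nil
}

func main() {
	fmt.Printf("%+v\n", lookup("VPXOR"))
}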
JBE _65through128
CMPQ BX, $256
JBE _129through256
+ CMPB runtime·support_avx2(SB), $1
+ JE loop_preheader_avx2
// TODO: use branch table and BSR to make this just a single dispatch
- // TODO: for really big clears, use MOVNTDQ.
+ // TODO: for really big clears, use MOVNTDQ, even without AVX2.
loop:
MOVOU X0, 0(DI)
JAE loop
JMP tail
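The branch-table TODO above could key the dispatch on the bit length of the size, which is exactly what BSR computes. A minimal Go sketch of the idea, with label strings standing in for the assembly's jump targets:

package main

import (
	"fmt"
	"math/bits"
)

// dispatch replaces the CMPQ/JBE chain with one table lookup keyed on
// bits.Len (the BSR analogue): for n >= 1, bits.Len(n-1) == i exactly
// when n is in (2^(i-1), 2^i].
func dispatch(n uint) string {
	table := []string{
		"_1or2", "_1or2", "_3or4", "_5through8", "_9through16",
		"_17through32", "_33through64", "_65through128", "_129through256",
	}
	if n == 0 {
		return "ret"
	}
	if i := bits.Len(n - 1); i < len(table) {
		return table[i]
	}
	return "loop" // > 256 bytes: fall through to the wide loops
}

func main() {
	for _, n := range []uint{1, 8, 100, 256, 300} {
		fmt.Printf("%4d -> %s\n", n, dispatch(n))
	}
}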
+loop_preheader_avx2:
+ VPXOR X0, X0, X0
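+ // A VEX-encoded XOR of X0 with itself zeroes all 256 bits of Y0 (VEX ops
+ // clear the upper lanes), so each MOVHDU/MOVNTHD below stores 32 zero bytes.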
+ // Below this cutoff, MOVNTDQ may be faster or slower than cached stores
+ // depending on the hardware. Above it, MOVNTDQ is consistently faster,
+ // even on dual Xeons with 30MB of cache.
+ // TODO: take the actual LLC size into account; e.g. glibc uses LLC size/2.
+ CMPQ BX, $0x2000000
+ JAE loop_preheader_avx2_huge
+loop_avx2:
+ MOVHDU X0, 0(DI)
+ MOVHDU X0, 32(DI)
+ MOVHDU X0, 64(DI)
+ MOVHDU X0, 96(DI)
+ SUBQ $128, BX
+ ADDQ $128, DI
+ CMPQ BX, $128
+ JAE loop_avx2
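+ // BX < 128 here. Clear the final 128 bytes with four overlapping stores
+ // addressed from the end (DI+BX), re-touching bytes the loop already
+ // cleared instead of running a scalar tail loop.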
+ MOVHDU X0, -32(DI)(BX*1)
+ MOVHDU X0, -64(DI)(BX*1)
+ MOVHDU X0, -96(DI)(BX*1)
+ MOVHDU X0, -128(DI)(BX*1)
+ VZEROUPPER
+ RET
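The shape of this cached AVX2 path, restated as plain Go over a byte slice (illustrative only; the names are made up and the real code works on raw pointers): clear 128 bytes per iteration, then cover the sub-128-byte remainder with four overlapping 32-byte stores.

package main

import "fmt"

// clearBig mirrors loop_avx2 for len(b) >= 128: four 32-byte stores per
// iteration, then four overlapping stores addressed from the end.
func clearBig(b []byte) {
	var zero [32]byte // plays the role of Y0 after VPXOR
	n, i := len(b), 0
	for n-i >= 128 {
		copy(b[i:i+32], zero[:])
		copy(b[i+32:i+64], zero[:])
		copy(b[i+64:i+96], zero[:])
		copy(b[i+96:i+128], zero[:])
		i += 128
	}
	// Overlapping tail: the final 128 bytes are rewritten relative to
	// the end, so no byte-at-a-time cleanup is needed.
	copy(b[n-32:n], zero[:])
	copy(b[n-64:n-32], zero[:])
	copy(b[n-96:n-64], zero[:])
	copy(b[n-128:n-96], zero[:])
}

func main() {
	b := make([]byte, 300)
	for i := range b {
		b[i] = 0xff
	}
	clearBig(b)
	fmt.Println(b[0], b[150], b[299]) // 0 0 0
}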
+loop_preheader_avx2_huge:
+ // Align DI to a 32-byte boundary. The MOVHDU below clears the unaligned
+ // head, then DI is rounded up and BX shrunk by the same amount.
+ MOVHDU X0, 0(DI)
+ MOVQ DI, SI
+ ADDQ $32, DI
+ ANDQ $~31, DI
+ SUBQ DI, SI
+ ADDQ SI, BX
+loop_avx2_huge:
+ MOVNTHD X0, 0(DI)
+ MOVNTHD X0, 32(DI)
+ MOVNTHD X0, 64(DI)
+ MOVNTHD X0, 96(DI)
+ SUBQ $128, BX
+ ADDQ $128, DI
+ CMPQ BX, $128
+ JAE loop_avx2_huge
+ // MOVNTDQ stores are weakly ordered. The description of MOVNTDQ in [1] says:
+ // "... fencing operation implemented with the SFENCE or MFENCE instruction
+ // should be used in conjunction with MOVNTDQ instructions..."
+ // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
+ SFENCE
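+ // The four tail stores below hit unaligned addresses, so they use ordinary
+ // MOVHDU; MOVNTHD (VMOVNTDQ) requires a 32-byte-aligned operand.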
+ MOVHDU X0, -32(DI)(BX*1)
+ MOVHDU X0, -64(DI)(BX*1)
+ MOVHDU X0, -96(DI)(BX*1)
+ MOVHDU X0, -128(DI)(BX*1)
+ VZEROUPPER
+ RET
+
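The huge-path preheader's pointer arithmetic, restated in Go (alignUp32 is a made-up name; the runtime does this in registers): round the destination up to a 32-byte boundary for MOVNTHD and shrink the length by the head bytes the unaligned MOVHDU already cleared.

package main

import "fmt"

// alignUp32 mirrors ADDQ $32, DI; ANDQ $~31, DI and the BX adjustment:
// it consumes 1..32 head bytes (even an already-aligned pointer skips a
// full 32, since the head store covered them) and returns the aligned
// pointer together with the remaining length.
func alignUp32(p, n uintptr) (uintptr, uintptr) {
	aligned := (p + 32) &^ 31
	return aligned, n - (aligned - p)
}

func main() {
	fmt.Println(alignUp32(0x1005, 4096)) // 4128 4069
}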
_1or2:
MOVB AX, (DI)
MOVB AX, -1(DI)(BX*1)
func BenchmarkMemclr256(b *testing.B) { bmMemclr(b, 256) }
func BenchmarkMemclr4096(b *testing.B) { bmMemclr(b, 4096) }
func BenchmarkMemclr65536(b *testing.B) { bmMemclr(b, 65536) }
+func BenchmarkMemclr1M(b *testing.B) { bmMemclr(b, 1<<20) }
+func BenchmarkMemclr4M(b *testing.B) { bmMemclr(b, 4<<20) }
+func BenchmarkMemclr8M(b *testing.B) { bmMemclr(b, 8<<20) }
+func BenchmarkMemclr16M(b *testing.B) { bmMemclr(b, 16<<20) }
+func BenchmarkMemclr64M(b *testing.B) { bmMemclr(b, 64<<20) }
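The new megabyte-scale benchmarks straddle the 0x2000000 (32MB) cutoff, so Memclr16M exercises the cached-store path while Memclr64M hits the MOVNTHD path; they can be run with go test -run=^$ -bench=Memclr runtime.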
func bmGoMemclr(b *testing.B, n int) {
x := make([]byte, n)