// For sizes >= 4KB, the XC loop is unrolled 16 times (16 * 256B = 4KB cleared per iteration)
clearge4KB:
XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
- XC $256, 0(R4), 0(R4)
- ADD $256, R4
- ADD $-256, R5
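+ // Clear bytes 256..4095 with fixed-offset XCs (XC zeroes storage by
+ // XORing a range with itself); R4 and R5 are updated once per 4KB
+ // pass instead of after every 256B block.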
+ XC $256, 256(R4), 256(R4)
+ XC $256, 512(R4), 512(R4)
+ XC $256, 768(R4), 768(R4)
+ XC $256, 1024(R4), 1024(R4)
+ XC $256, 1280(R4), 1280(R4)
+ XC $256, 1536(R4), 1536(R4)
+ XC $256, 1792(R4), 1792(R4)
+ XC $256, 2048(R4), 2048(R4)
+ XC $256, 2304(R4), 2304(R4)
+ XC $256, 2560(R4), 2560(R4)
+ XC $256, 2816(R4), 2816(R4)
+ XC $256, 3072(R4), 3072(R4)
+ XC $256, 3328(R4), 3328(R4)
+ XC $256, 3584(R4), 3584(R4)
+ XC $256, 3840(R4), 3840(R4)
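+ // Advance the pointer and shrink the remaining length by the 4KB just cleared.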
+ ADD $4096, R4
+ ADD $-4096, R5
CMP R5, $4096
BGE clearge4KB
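+ // The pass above is equivalent to this Go-style sketch (illustrative only;
+ // ptr and n stand in for R4 and R5, and the hypothetical xc256 stands for
+ // a single 256-byte XC):
+ //
+ //	for n >= 4096 {
+ //		for off := uintptr(0); off < 4096; off += 256 {
+ //			xc256(ptr + off)
+ //		}
+ //		ptr += 4096
+ //		n -= 4096
+ //	}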
clear32:
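// Clear exactly 32 bytes: store the zeroed vector register V1 at offsets 0 and 16.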
VZERO V1
VST V1, 0(R4)
VST V1, 16(R4)
RET
clear33to64: