MOVD b+16(FP), R5 // R5 = b
MOVD n+24(FP), R6 // R6 = n
- CMPU R6, $32, CR7 // Check if n ≥ 32 bytes
+ CMPU R6, $64, CR7 // Check if n ≥ 64 bytes
MOVD R0, R8 // R8 = index
- CMPU R6, $8, CR6 // Check if 8 ≤ n < 32 bytes
- BLT CR6, small // Smaller than 8
- BLT CR7, xor16 // Case for 16 ≤ n < 32 bytes
+ CMPU R6, $8, CR6 // Check if 8 ≤ n < 64 bytes
+ BLE CR6, small // <= 8
+ BLT CR7, xor32 // Case for 32 ≤ n < 64 bytes
- // Case for n ≥ 32 bytes
-preloop32:
- SRD $5, R6, R7 // Setup loop counter
+ // Case for n ≥ 64 bytes
+preloop64:
+ SRD $6, R6, R7 // Set up loop counter
MOVD R7, CTR
MOVD $16, R10
- ANDCC $31, R6, R9 // Check for tailing bytes for later
-loop32:
- LXVD2X (R4)(R8), VS32 // VS32 = a[i,...,i+15]
- LXVD2X (R4)(R10), VS34
- LXVD2X (R5)(R8), VS33 // VS33 = b[i,...,i+15]
- LXVD2X (R5)(R10), VS35
- XXLXOR VS32, VS33, VS32 // VS34 = a[] ^ b[]
- XXLXOR VS34, VS35, VS34
- STXVD2X VS32, (R3)(R8) // Store to dst
- STXVD2X VS34, (R3)(R10)
- ADD $32, R8 // Update index
- ADD $32, R10
- BC 16, 0, loop32 // bdnz loop16
-
- BEQ CR0, done
-
- MOVD R9, R6
- CMP R6, $8
- BLT small
+ MOVD $32, R14
+ MOVD $48, R15
+ ANDCC $63, R6, R9 // Check for tailing bytes for later
+ PCALIGN $16
+ // Case for >= 64 bytes
+ // Process 64 bytes per iteration
+ // Load 4 vectors of a and b
+ // XOR the corresponding vectors
+ // from a and b and store the result
+loop64:
+ LXVD2X (R4)(R8), VS32
+ LXVD2X (R4)(R10), VS34
+ LXVD2X (R4)(R14), VS36
+ LXVD2X (R4)(R15), VS38
+ LXVD2X (R5)(R8), VS33
+ LXVD2X (R5)(R10), VS35
+ LXVD2X (R5)(R14), VS37
+ LXVD2X (R5)(R15), VS39
+ XXLXOR VS32, VS33, VS32
+ XXLXOR VS34, VS35, VS34
+ XXLXOR VS36, VS37, VS36
+ XXLXOR VS38, VS39, VS38
+ STXVD2X VS32, (R3)(R8)
+ STXVD2X VS34, (R3)(R10)
+ STXVD2X VS36, (R3)(R14)
+ STXVD2X VS38, (R3)(R15)
+ ADD $64, R8
+ ADD $64, R10
+ ADD $64, R14
+ ADD $64, R15
+ BDNZ loop64
+ BC 12,2,LR // BEQLR
+ MOVD R9, R6
+ CMP R6, $8
+ BLE small
+ // Case for 8 <= n < 64 bytes
+ // Process 32 bytes if available
+xor32:
+ CMP R6, $32
+ BLT xor16
+ ADD $16, R8, R9
+ LXVD2X (R4)(R8), VS32
+ LXVD2X (R4)(R9), VS33
+ LXVD2X (R5)(R8), VS34
+ LXVD2X (R5)(R9), VS35
+ XXLXOR VS32, VS34, VS32
+ XXLXOR VS33, VS35, VS33
+ STXVD2X VS32, (R3)(R8)
+ STXVD2X VS33, (R3)(R9)
+ ADD $32, R8
+ ADD $-32, R6
+ CMP R6, $8
+ BLE small
+ // Case for 8 <= n < 32 bytes
+ // Process 16 bytes if available
xor16:
- CMP R6, $16
- BLT xor8
- LXVD2X (R4)(R8), VS32
- LXVD2X (R5)(R8), VS33
- XXLXOR VS32, VS33, VS32
- STXVD2X VS32, (R3)(R8)
- ADD $16, R8
- ADD $-16, R6
- CMP R6, $8
- BLT small
+ CMP R6, $16
+ BLT xor8
+ LXVD2X (R4)(R8), VS32
+ LXVD2X (R5)(R8), VS33
+ XXLXOR VS32, VS33, VS32
+ STXVD2X VS32, (R3)(R8)
+ ADD $16, R8
+ ADD $-16, R6
+small:
+ CMP R6, R0
+ BC 12,2,LR // BEQLR
xor8:
- // Case for 8 ≤ n < 16 bytes
- MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
- MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
- XOR R14, R15, R16 // R16 = a[] ^ b[]
- SUB $8, R6 // n = n - 8
- MOVD R16, (R3)(R8) // Store to dst
- ADD $8, R8
-
- // Check if we're finished
- CMP R6, R0
- BGT small
+#ifdef GOPPC64_power10
+ SLD $56,R6,R17
+ ADD R4,R8,R18
+ ADD R5,R8,R19
+ ADD R3,R8,R20
+ LXVL R18,R17,V0
+ LXVL R19,R17,V1
+ VXOR V0,V1,V1
+ STXVL V1,R20,R17
RET
-
- // Case for n < 8 bytes and tailing bytes from the
- // previous cases.
-small:
+#else
+ CMP R6, $8
+ BLT xor4
+ // Case for 8 ≤ n < 16 bytes
+ MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
+ MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
+ XOR R14, R15, R16 // R16 = a[] ^ b[]
+ SUB $8, R6 // n = n - 8
+ MOVD R16, (R3)(R8) // Store to dst
+ ADD $8, R8
+xor4:
+ CMP R6, $4
+ BLT xor2
+ MOVWZ (R4)(R8), R14
+ MOVWZ (R5)(R8), R15
+ XOR R14, R15, R16
+ MOVW R16, (R3)(R8)
+ ADD $4,R8
+ ADD $-4,R6
+xor2:
+ CMP R6, $2
+ BLT xor1
+ MOVHZ (R4)(R8), R14
+ MOVHZ (R5)(R8), R15
+ XOR R14, R15, R16
+ MOVH R16, (R3)(R8)
+ ADD $2,R8
+ ADD $-2,R6
+xor1:
CMP R6, R0
- BEQ done
- MOVD R6, CTR // Setup loop counter
-
-loop:
+ BC 12,2,LR // BEQLR
MOVBZ (R4)(R8), R14 // R14 = a[i]
MOVBZ (R5)(R8), R15 // R15 = b[i]
XOR R14, R15, R16 // R16 = a[i] ^ b[i]
MOVB R16, (R3)(R8) // Store to dst
- ADD $1, R8
- BC 16, 0, loop // bdnz loop
-
+#endif
done:
RET