#define CTX R3
#define INP R4
#define END R5
-#define TBL R6
-#define IDX R7
+#define TBL R6 // Pointer into the kcon table.
#define LEN R9
#define TEMP R12
-#define HEX00 R0
-#define HEX10 R10
+#define TBL_STRT R7 // Pointer to start of kcon table.
+
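+// Registers holding fixed byte offsets (0x000-0x110). They are used as
+// the index operand of indexed loads and stores, replacing the old IDX
+// register that had to be incremented after every kcon load.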
+#define R_x000 R0
+#define R_x010 R8
+#define R_x020 R10
+#define R_x030 R11
+#define R_x040 R14
+#define R_x050 R15
+#define R_x060 R16
+#define R_x070 R17
+#define R_x080 R18
+#define R_x090 R19
+#define R_x0a0 R20
+#define R_x0b0 R21
+#define R_x0c0 R22
+#define R_x0d0 R23
+#define R_x0e0 R24
+#define R_x0f0 R25
+#define R_x100 R26
+#define R_x110 R27
+
// V0-V7 are A-H
// V8-V23 are used for the message schedule
#define S1 V27
#define s0 V28
#define s1 V29
-#define LEMASK V31 // Permutation control register for little endian
+#define LEMASK V31 // Permutation control register for little endian
// 4 copies of each Kt, to fill all 4 words of a vector register
DATA ·kcon+0x000(SB)/8, $0x428a2f98428a2f98
DATA ·kcon+0x408(SB)/8, $0x0000000000000000
#ifdef GOARCH_ppc64le
-DATA ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
+DATA ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors
DATA ·kcon+0x418(SB)/8, $0x1011121300010203
DATA ·kcon+0x420(SB)/8, $0x1011121310111213
DATA ·kcon+0x428(SB)/8, $0x0405060700010203
DATA ·kcon+0x438(SB)/8, $0x0405060700010203
#else
DATA ·kcon+0x410(SB)/8, $0x1011121300010203
-DATA ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
+DATA ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors
DATA ·kcon+0x420(SB)/8, $0x0405060700010203
DATA ·kcon+0x428(SB)/8, $0x1011121310111213
DATA ·kcon+0x430(SB)/8, $0x0001020304050607
GLOBL ·kcon(SB), RODATA, $1088
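+// The round macros take an extra idx argument naming one of the fixed
+// offset registers above; the per-round "ADD $16, IDX" is removed and
+// TBL is instead advanced in larger steps between groups of rounds.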
-#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \
+#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
VSEL g, f, e, FUNC; \
VSHASIGMAW $15, e, $1, S1; \
VADDUWM xi, h, h; \
VADDUWM KI, g, g; \
VADDUWM h, d, d; \
VADDUWM FUNC, S0, S0; \
- LVX (TBL)(IDX), KI; \
- ADD $16, IDX; \
+ LVX (TBL)(idx), KI; \
VADDUWM S0, h, h
-#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
+#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
VSHASIGMAW $0, xj_1, $0, s0; \
VSEL g, f, e, FUNC; \
VSHASIGMAW $15, e, $1, S1; \
VADDUWM h, d, d; \
VADDUWM FUNC, S0, S0; \
VADDUWM s0, xj, xj; \
- LVX (TBL)(IDX), KI; \
- ADD $16, IDX; \
+ LVX (TBL)(idx), KI; \
VADDUWM S0, h, h; \
VADDUWM s1, xj, xj
CMP INP, END
BEQ end
- MOVD $·kcon(SB), TBL
- MOVWZ $0x10, HEX10
- MOVWZ $8, IDX
+ MOVD $·kcon(SB), TBL_STRT
+ MOVD $0x10, R_x010
#ifdef GOARCH_ppc64le
- LVSL (IDX)(R0), LEMASK
+ MOVWZ $8, TEMP
+ LVSL (TEMP)(R0), LEMASK
VSPLTISB $0x0F, KI
VXOR KI, LEMASK, LEMASK
#endif
- LXVW4X (CTX)(HEX00), VS32 // v0 = vs32
- LXVW4X (CTX)(HEX10), VS36 // v4 = vs36
+ LXVW4X (CTX)(R_x000), V0
+ LXVW4X (CTX)(R_x010), V4
// unpack the input values into vector registers
VSLDOI $4, V0, V0, V1
VSLDOI $8, V4, V4, V6
VSLDOI $12, V4, V4, V7
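+ // Preload the fixed offsets used by the kcon and message loads below.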
+ MOVD $0x020, R_x020
+ MOVD $0x030, R_x030
+ MOVD $0x040, R_x040
+ MOVD $0x050, R_x050
+ MOVD $0x060, R_x060
+ MOVD $0x070, R_x070
+ MOVD $0x080, R_x080
+ MOVD $0x090, R_x090
+ MOVD $0x0a0, R_x0a0
+ MOVD $0x0b0, R_x0b0
+ MOVD $0x0c0, R_x0c0
+ MOVD $0x0d0, R_x0d0
+ MOVD $0x0e0, R_x0e0
+ MOVD $0x0f0, R_x0f0
+ MOVD $0x100, R_x100
+ MOVD $0x110, R_x110
+
loop:
- LVX (TBL)(HEX00), KI
- MOVWZ $16, IDX
+ MOVD TBL_STRT, TBL // rewind TBL to the start of the kcon table for this block
+ LVX (TBL)(R_x000), KI
- LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance
- ADD $16, INP
+ LXVD2X (INP)(R_x000), V8 // load v8 in advance
// Offload to VSR24-31 (aka FPR24-31)
- XXLORQ VS32, VS32, VS24
- XXLORQ VS33, VS33, VS25
- XXLORQ VS34, VS34, VS26
- XXLORQ VS35, VS35, VS27
- XXLORQ VS36, VS36, VS28
- XXLORQ VS37, VS37, VS29
- XXLORQ VS38, VS38, VS30
- XXLORQ VS39, VS39, VS31
-
- VADDUWM KI, V7, V7 // h+K[i]
- LVX (TBL)(IDX), KI
- ADD $16, IDX
+ XXLOR V0, V0, VS24
+ XXLOR V1, V1, VS25
+ XXLOR V2, V2, VS26
+ XXLOR V3, V3, VS27
+ XXLOR V4, V4, VS28
+ XXLOR V5, V5, VS29
+ XXLOR V6, V6, VS30
+ XXLOR V7, V7, VS31
+
+ VADDUWM KI, V7, V7 // h+K[i]
+ LVX (TBL)(R_x010), KI
VPERMLE(V8, V8, LEMASK, V8)
- SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
+ SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
VSLDOI $4, V8, V8, V9
- SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
+ SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
VSLDOI $4, V9, V9, V10
- SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
- LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance
- ADD $16, INP, INP
+ SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
+ LXVD2X (INP)(R_x010), V12 // load v12 in advance
VSLDOI $4, V10, V10, V11
- SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
+ SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
VPERMLE(V12, V12, LEMASK, V12)
- SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
+ SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
VSLDOI $4, V12, V12, V13
- SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
+ SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
VSLDOI $4, V13, V13, V14
- SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
- LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance
- ADD $16, INP, INP
+ SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
+ LXVD2X (INP)(R_x020), V16 // load v16 in advance
VSLDOI $4, V14, V14, V15
- SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
+ SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
VPERMLE(V16, V16, LEMASK, V16)
- SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
+ SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
VSLDOI $4, V16, V16, V17
- SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
+ SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
VSLDOI $4, V17, V17, V18
- SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
+ SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
VSLDOI $4, V18, V18, V19
- LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance
- ADD $16, INP, INP
- SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
+ LXVD2X (INP)(R_x030), V20 // load v20 in advance
+ SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
VPERMLE(V20, V20, LEMASK, V20)
- SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
+ SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
VSLDOI $4, V20, V20, V21
- SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
+ SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
VSLDOI $4, V21, V21, V22
- SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
+ SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
VSLDOI $4, V22, V22, V23
- SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
+ SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
- MOVWZ $3, TEMP
- MOVWZ TEMP, CTR
+ MOVD $3, TEMP
+ MOVD TEMP, CTR // three more passes of 16 rounds each (rounds 16-63)
+ ADD $0x120, TBL // skip the 18 kcon entries (0x120 bytes) consumed so far
+ ADD $0x40, INP // advance past the 64-byte block loaded above
L16_xx:
- SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
- SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
- SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
- SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
- SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
- SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
- SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
- SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
- SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
- SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
- SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
- SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
- SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
- SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
- SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
- SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
-
- BC 0x10, 0, L16_xx // bdnz
-
- XXLORQ VS24, VS24, VS42
-
- XXLORQ VS25, VS25, VS43
+ SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
+ SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
+ SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
+ SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
+ SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
+ SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
+ SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
+ SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
+ SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
+ SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
+ SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
+ SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
+ SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
+ SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
+ SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
+ SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
+ ADD $0x100, TBL // move to the next 16 kcon entries
+
+ BDNZ L16_xx
+
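+ // Restore the saved A-H from VS24-31 and add them to the working state.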
+ XXLOR VS24, VS24, V10
+
+ XXLOR VS25, VS25, V11
VADDUWM V10, V0, V0
- XXLORQ VS26, VS26, VS44
+ XXLOR VS26, VS26, V12
VADDUWM V11, V1, V1
- XXLORQ VS27, VS27, VS45
+ XXLOR VS27, VS27, V13
VADDUWM V12, V2, V2
- XXLORQ VS28, VS28, VS46
+ XXLOR VS28, VS28, V14
VADDUWM V13, V3, V3
- XXLORQ VS29, VS29, VS47
+ XXLOR VS29, VS29, V15
VADDUWM V14, V4, V4
- XXLORQ VS30, VS30, VS48
+ XXLOR VS30, VS30, V16
VADDUWM V15, V5, V5
- XXLORQ VS31, VS31, VS49
+ XXLOR VS31, VS31, V17
VADDUWM V16, V6, V6
VADDUWM V17, V7, V7
CMPU INP, END
BLT loop
- LVX (TBL)(IDX), V8
- ADD $16, IDX
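+ // Gather the eight state words into V0 and V4 using the permutation
+ // control vectors at the end of kcon, then store the updated digest.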
+ LVX (TBL)(R_x000), V8
VPERM V0, V1, KI, V0
- LVX (TBL)(IDX), V9
+ LVX (TBL)(R_x010), V9
VPERM V4, V5, KI, V4
VPERM V0, V2, V8, V0
VPERM V4, V6, V8, V4
VPERM V0, V3, V9, V0
VPERM V4, V7, V9, V4
- STXVD2X VS32, (CTX+HEX00) // v0 = vs32
- STXVD2X VS36, (CTX+HEX10) // v4 = vs36
+ STXVD2X V0, (CTX+R_x000)
+ STXVD2X V4, (CTX+R_x010)
end:
RET