#define INP R4
#define END R5
#define TBL R6
-#define IDX R7
#define CNT R8
#define LEN R9
-#define OFFLOAD R11
#define TEMP R12
-#define HEX00 R0
-#define HEX10 R10
-#define HEX20 R25
-#define HEX30 R26
+#define TBL_STRT R7 // Pointer to start of kcon table.
+
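+// R_x000-R_x110 hold the constant offsets 0x000-0x110. Keeping each
+// offset in its own register lets LVX and LXVD2X use fixed index
+// registers instead of a single, serially incremented IDX.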
+#define R_x000 R0
+#define R_x010 R10
+#define R_x020 R25
+#define R_x030 R26
+#define R_x040 R14
+#define R_x050 R15
+#define R_x060 R16
+#define R_x070 R17
+#define R_x080 R18
+#define R_x090 R19
+#define R_x0a0 R20
+#define R_x0b0 R21
+#define R_x0c0 R22
+#define R_x0d0 R23
+#define R_x0e0 R24
+#define R_x0f0 R28
+#define R_x100 R29
+#define R_x110 R27
+
// V0-V7 are A-H
// V8-V23 are used for the message schedule
DATA ·kcon+0x518(SB)/8, $0x0001020304050607
GLOBL ·kcon(SB), RODATA, $1312
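// kcon holds the 80 SHA-512 round constants, each doubled into a
// 16-byte entry, followed by a zero pad and a VPERM byte-swap mask
// (0x520 = 1312 bytes total).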
-#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi) \
+#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi, idx) \
VSEL g, f, e, FUNC; \
VSHASIGMAD $15, e, $1, S1; \
VADDUDM xi, h, h; \
VADDUDM KI, g, g; \
VADDUDM h, d, d; \
VADDUDM FUNC, S0, S0; \
- LVX (TBL)(IDX), KI; \
- ADD $16, IDX; \
+ LVX (TBL)(idx), KI; \
VADDUDM S0, h, h
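+// SHA512ROUND1 is SHA512ROUND0 plus message schedule expansion: the
+// next schedule vector xj is updated from xj_1, xj_9 and xj_14 using
+// the VSHASIGMAD sigma functions.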
-#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
+#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14, idx) \
VSHASIGMAD $0, xj_1, $0, s0; \
VSEL g, f, e, FUNC; \
VSHASIGMAD $15, e, $1, S1; \
VADDUDM h, d, d; \
VADDUDM FUNC, S0, S0; \
VADDUDM s0, xj, xj; \
- LVX (TBL)(IDX), KI; \
- ADD $16, IDX; \
+ LVX (TBL)(idx), KI; \
VADDUDM S0, h, h; \
VADDUDM s1, xj, xj
CMP INP, END
BEQ end
- MOVD $·kcon(SB), TBL
- MOVD R1, OFFLOAD
+ MOVD $·kcon(SB), TBL_STRT
MOVD R0, CNT
- MOVWZ $0x10, HEX10
- MOVWZ $0x20, HEX20
- MOVWZ $0x30, HEX30
+ MOVD $0x010, R_x010
+ MOVD $0x020, R_x020
+ MOVD $0x030, R_x030
+ MOVD $0x040, R_x040
+ MOVD $0x050, R_x050
+ MOVD $0x060, R_x060
+ MOVD $0x070, R_x070
+ MOVD $0x080, R_x080
+ MOVD $0x090, R_x090
+ MOVD $0x0a0, R_x0a0
+ MOVD $0x0b0, R_x0b0
+ MOVD $0x0c0, R_x0c0
+ MOVD $0x0d0, R_x0d0
+ MOVD $0x0e0, R_x0e0
+ MOVD $0x0f0, R_x0f0
+ MOVD $0x100, R_x100
+ MOVD $0x110, R_x110
-// Generate the mask used with VPERM for LE
#ifdef GOARCH_ppc64le
- MOVWZ $8, IDX
- LVSL (IDX)(R0), LEMASK
+ // Generate the mask used with VPERM for LE: LVSL plus the VXOR with
+ // 0x0F below yields a permutation that byte-reverses each doubleword.
+ MOVWZ $8, TEMP
+ LVSL (TEMP)(R0), LEMASK
VSPLTISB $0x0F, KI
VXOR KI, LEMASK, LEMASK
#endif
- LXVD2X (CTX)(HEX00), VS32 // v0 = vs32
- LXVD2X (CTX)(HEX10), VS34 // v2 = vs34
- LXVD2X (CTX)(HEX20), VS36 // v4 = vs36
+ LXVD2X (CTX)(R_x000), VS32 // v0 = vs32
+ LXVD2X (CTX)(R_x010), VS34 // v2 = vs34
+ LXVD2X (CTX)(R_x020), VS36 // v4 = vs36
+
// unpack the input values into vector registers
VSLDOI $8, V0, V0, V1
- LXVD2X (CTX)(HEX30), VS38 // v6 = vs38
+ LXVD2X (CTX)(R_x030), VS38 // v6 = vs38
VSLDOI $8, V2, V2, V3
VSLDOI $8, V4, V4, V5
VSLDOI $8, V6, V6, V7
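// LXVD2X loads two 64-bit state words per vector; VSLDOI $8 rotates
// each pair so the second word is addressable as its own register,
// leaving A-H in V0-V7.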
loop:
- LVX (TBL)(HEX00), KI
- MOVWZ $16, IDX
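+ // Reset TBL to the start of kcon for each 128-byte block.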
+ MOVD TBL_STRT, TBL
+ LVX (TBL)(R_x000), KI
LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance
ADD $16, INP
XXLOR V7, V7, VS31
VADDUDM KI, V7, V7 // h+K[i]
- LVX (TBL)(IDX), KI
- ADD $16, IDX
+ LVX (TBL)(R_x010), KI
VPERMLE(V8,V8,LEMASK,V8)
- SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
- LXVD2X (INP)(R0), VS42 // load v10 (=vs42) in advance
- ADD $16, INP, INP
+ SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8, R_x020)
+ LXVD2X (INP)(R_x000), VS42 // load v10 (=vs42) in advance
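+ // INP is no longer bumped after each advance load; the fixed offset
+ // registers address the rest of the block, and INP advances once
+ // (ADD $0x70) before L16_xx.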
VSLDOI $8, V8, V8, V9
- SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
+ SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9, R_x030)
VPERMLE(V10,V10,LEMASK,V10)
- SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
- LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance
- ADD $16, INP, INP
+ SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10, R_x040)
+ LXVD2X (INP)(R_x010), VS44 // load v12 (=vs44) in advance
VSLDOI $8, V10, V10, V11
- SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
+ SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11, R_x050)
VPERMLE(V12,V12,LEMASK,V12)
- SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
- LXVD2X (INP)(R0), VS46 // load v14 (=vs46) in advance
- ADD $16, INP, INP
+ SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12, R_x060)
+ LXVD2X (INP)(R_x020), VS46 // load v14 (=vs46) in advance
VSLDOI $8, V12, V12, V13
- SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
+ SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13, R_x070)
VPERMLE(V14,V14,LEMASK,V14)
- SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
- LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance
- ADD $16, INP, INP
+ SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14, R_x080)
+ LXVD2X (INP)(R_x030), VS48 // load v16 (=vs48) in advance
VSLDOI $8, V14, V14, V15
- SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
+ SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15, R_x090)
VPERMLE(V16,V16,LEMASK,V16)
- SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
- LXVD2X (INP)(R0), VS50 // load v18 (=vs50) in advance
- ADD $16, INP, INP
+ SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16, R_x0a0)
+ LXVD2X (INP)(R_x040), VS50 // load v18 (=vs50) in advance
VSLDOI $8, V16, V16, V17
- SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
+ SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17, R_x0b0)
VPERMLE(V18,V18,LEMASK,V18)
- SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
- LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance
- ADD $16, INP, INP
+ SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18, R_x0c0)
+ LXVD2X (INP)(R_x050), VS52 // load v20 (=vs52) in advance
VSLDOI $8, V18, V18, V19
- SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
+ SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19, R_x0d0)
VPERMLE(V20,V20,LEMASK,V20)
- SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
- LXVD2X (INP)(R0), VS54 // load v22 (=vs54) in advance
- ADD $16, INP, INP
+ SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20, R_x0e0)
+ LXVD2X (INP)(R_x060), VS54 // load v22 (=vs54) in advance
VSLDOI $8, V20, V20, V21
- SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
+ SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21, R_x0f0)
VPERMLE(V22,V22,LEMASK,V22)
- SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
+ SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22, R_x100)
VSLDOI $8, V22, V22, V23
- SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
+ SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x110)
MOVWZ $4, TEMP
MOVWZ TEMP, CTR // CTR = 4: L16_xx iteration count
+ ADD $0x120, TBL // advance TBL past the 18 kcon entries (0x120 bytes) consumed above
+ ADD $0x70, INP // with the earlier ADD $16, INP has now advanced one full 128-byte block
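+
+// The L16_xx loop runs the remaining 64 rounds (4 iterations of 16),
+// expanding the message schedule on the fly.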
L16_xx:
- SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
- SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
- SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
- SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
- SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
- SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
- SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
- SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
- SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
- SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
- SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
- SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
- SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
- SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
- SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
- SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)
-
- BC 0x10, 0, L16_xx // bdnz
+ SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23, R_x000)
+ SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8, R_x010)
+ SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9, R_x020)
+ SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10, R_x030)
+ SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11, R_x040)
+ SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12, R_x050)
+ SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13, R_x060)
+ SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14, R_x070)
+ SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15, R_x080)
+ SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16, R_x090)
+ SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17, R_x0a0)
+ SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18, R_x0b0)
+ SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19, R_x0c0)
+ SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20, R_x0d0)
+ SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21, R_x0e0)
+ SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22, R_x0f0)
+ ADD $0x100, TBL // advance TBL past the 16 kcon entries consumed per iteration
+
+ BDNZ L16_xx
XXLOR VS24, VS24, V10
XXLOR VS25, VS25, V11
VPERM V5, V4, KI, V4
VPERM V7, V6, KI, V6
#endif
- STXVD2X VS32, (CTX+HEX00) // v0 = vs32
- STXVD2X VS34, (CTX+HEX10) // v2 = vs34
- STXVD2X VS36, (CTX+HEX20) // v4 = vs36
- STXVD2X VS38, (CTX+HEX30) // v6 = vs38
+ STXVD2X VS32, (CTX+R_x000) // v0 = vs32
+ STXVD2X VS34, (CTX+R_x010) // v2 = vs34
+ STXVD2X VS36, (CTX+R_x020) // v4 = vs36
+ STXVD2X VS38, (CTX+R_x030) // v6 = vs38
end:
RET