From d24c65d8bfb24b7064d49ec49b418ef273dc17f2 Mon Sep 17 00:00:00 2001 From: "Paul E. Murphy" Date: Tue, 3 May 2022 08:33:42 -0500 Subject: [PATCH] cmd/internal/notsha256: revert PPC64 removal, and fix PPC64 asm This reverts commit a41e37f56a4fc2523ac88a76bf54ba3e45dcf533, and updates the ASM usage to be go1.8 compliant. go 1.18 added support for using VR's in place of VSR arguments. The following transformations are made: XXLOR Vx, Vx, VSy -> XXLORQ VSx+32, VSx+32, VSy XXLOR VSx, VSx, Vy -> XXLORQ VSx, VSx, VSy+32 XXLOR is broken on 1.8, but XXLORQ is identical and still supported today. Change-Id: Icc9cd5511b412c30a14e6afd07a51839aaaf6021 Reviewed-on: https://go-review.googlesource.com/c/go/+/403734 Reviewed-by: Bryan Mills TryBot-Result: Gopher Robot Run-TryBot: Paul Murphy Reviewed-by: Lynn Boger Reviewed-by: David Chase --- .../internal/notsha256/sha256block_decl.go | 4 +- .../internal/notsha256/sha256block_generic.go | 4 +- .../internal/notsha256/sha256block_ppc64x.s | 430 ++++++++++++++++++ 3 files changed, 434 insertions(+), 4 deletions(-) create mode 100644 src/cmd/internal/notsha256/sha256block_ppc64x.s diff --git a/src/cmd/internal/notsha256/sha256block_decl.go b/src/cmd/internal/notsha256/sha256block_decl.go index b1dedf4f7e..5a822ee479 100644 --- a/src/cmd/internal/notsha256/sha256block_decl.go +++ b/src/cmd/internal/notsha256/sha256block_decl.go @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build 386 || amd64 -// +build 386 amd64 +//go:build 386 || amd64 || ppc64le || ppc64 +// +build 386 amd64 ppc64le ppc64 package notsha256 diff --git a/src/cmd/internal/notsha256/sha256block_generic.go b/src/cmd/internal/notsha256/sha256block_generic.go index 7197f2aac4..20ae841383 100644 --- a/src/cmd/internal/notsha256/sha256block_generic.go +++ b/src/cmd/internal/notsha256/sha256block_generic.go @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !amd64 && !386 -// +build !amd64,!386 +//go:build !amd64 && !386 && !ppc64le && !ppc64 +// +build !amd64,!386,!ppc64le,!ppc64 package notsha256 diff --git a/src/cmd/internal/notsha256/sha256block_ppc64x.s b/src/cmd/internal/notsha256/sha256block_ppc64x.s new file mode 100644 index 0000000000..6e0f1d6133 --- /dev/null +++ b/src/cmd/internal/notsha256/sha256block_ppc64x.s @@ -0,0 +1,430 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +// WARNING: this file is built by the bootstrap compiler, thus +// it must maintain compatibility with the oldest supported +// bootstrap toolchain. +// + +//go:build ppc64 || ppc64le +// +build ppc64 ppc64le + +// Based on CRYPTOGAMS code with the following comment: +// # ==================================================================== +// # Written by Andy Polyakov for the OpenSSL +// # project. The module is, however, dual licensed under OpenSSL and +// # CRYPTOGAMS licenses depending on where you obtain it. For further +// # details see http://www.openssl.org/~appro/cryptogams/. +// # ==================================================================== + +#include "textflag.h" + +// SHA256 block routine. See sha256block.go for Go equivalent. +// +// The algorithm is detailed in FIPS 180-4: +// +// https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf +// +// Wt = Mt; for 0 <= t <= 15 +// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 63 +// +// a = H0 +// b = H1 +// c = H2 +// d = H3 +// e = H4 +// f = H5 +// g = H6 +// h = H7 +// +// for t = 0 to 63 { +// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt +// T2 = BIGSIGMA0(a) + Maj(a,b,c) +// h = g +// g = f +// f = e +// e = d + T1 +// d = c +// c = b +// b = a +// a = T1 + T2 +// } +// +// H0 = a + H0 +// H1 = b + H1 +// H2 = c + H2 +// H3 = d + H3 +// H4 = e + H4 +// H5 = f + H5 +// H6 = g + H6 +// H7 = h + H7 + +#define CTX R3 +#define INP R4 +#define END R5 +#define TBL R6 +#define IDX R7 +#define LEN R9 +#define TEMP R12 + +#define HEX00 R0 +#define HEX10 R10 + +// V0-V7 are A-H +// V8-V23 are used for the message schedule +#define KI V24 +#define FUNC V25 +#define S0 V26 +#define S1 V27 +#define s0 V28 +#define s1 V29 +#define LEMASK V31 // Permutation control register for little endian + +// 4 copies of each Kt, to fill all 4 words of a vector register +DATA ·kcon+0x000(SB)/8, $0x428a2f98428a2f98 +DATA ·kcon+0x008(SB)/8, $0x428a2f98428a2f98 +DATA ·kcon+0x010(SB)/8, $0x7137449171374491 +DATA ·kcon+0x018(SB)/8, $0x7137449171374491 +DATA ·kcon+0x020(SB)/8, $0xb5c0fbcfb5c0fbcf +DATA ·kcon+0x028(SB)/8, $0xb5c0fbcfb5c0fbcf +DATA ·kcon+0x030(SB)/8, $0xe9b5dba5e9b5dba5 +DATA ·kcon+0x038(SB)/8, $0xe9b5dba5e9b5dba5 +DATA ·kcon+0x040(SB)/8, $0x3956c25b3956c25b +DATA ·kcon+0x048(SB)/8, $0x3956c25b3956c25b +DATA ·kcon+0x050(SB)/8, $0x59f111f159f111f1 +DATA ·kcon+0x058(SB)/8, $0x59f111f159f111f1 +DATA ·kcon+0x060(SB)/8, $0x923f82a4923f82a4 +DATA ·kcon+0x068(SB)/8, $0x923f82a4923f82a4 +DATA ·kcon+0x070(SB)/8, $0xab1c5ed5ab1c5ed5 +DATA ·kcon+0x078(SB)/8, $0xab1c5ed5ab1c5ed5 +DATA ·kcon+0x080(SB)/8, $0xd807aa98d807aa98 +DATA ·kcon+0x088(SB)/8, $0xd807aa98d807aa98 +DATA ·kcon+0x090(SB)/8, $0x12835b0112835b01 +DATA ·kcon+0x098(SB)/8, $0x12835b0112835b01 +DATA ·kcon+0x0A0(SB)/8, $0x243185be243185be +DATA ·kcon+0x0A8(SB)/8, $0x243185be243185be +DATA ·kcon+0x0B0(SB)/8, $0x550c7dc3550c7dc3 +DATA ·kcon+0x0B8(SB)/8, $0x550c7dc3550c7dc3 +DATA ·kcon+0x0C0(SB)/8, $0x72be5d7472be5d74 +DATA ·kcon+0x0C8(SB)/8, $0x72be5d7472be5d74 +DATA ·kcon+0x0D0(SB)/8, $0x80deb1fe80deb1fe +DATA ·kcon+0x0D8(SB)/8, $0x80deb1fe80deb1fe +DATA ·kcon+0x0E0(SB)/8, $0x9bdc06a79bdc06a7 +DATA ·kcon+0x0E8(SB)/8, $0x9bdc06a79bdc06a7 +DATA ·kcon+0x0F0(SB)/8, $0xc19bf174c19bf174 +DATA ·kcon+0x0F8(SB)/8, $0xc19bf174c19bf174 +DATA ·kcon+0x100(SB)/8, $0xe49b69c1e49b69c1 +DATA ·kcon+0x108(SB)/8, $0xe49b69c1e49b69c1 +DATA ·kcon+0x110(SB)/8, $0xefbe4786efbe4786 +DATA ·kcon+0x118(SB)/8, $0xefbe4786efbe4786 +DATA ·kcon+0x120(SB)/8, $0x0fc19dc60fc19dc6 +DATA ·kcon+0x128(SB)/8, $0x0fc19dc60fc19dc6 +DATA ·kcon+0x130(SB)/8, $0x240ca1cc240ca1cc +DATA ·kcon+0x138(SB)/8, $0x240ca1cc240ca1cc +DATA ·kcon+0x140(SB)/8, $0x2de92c6f2de92c6f +DATA ·kcon+0x148(SB)/8, $0x2de92c6f2de92c6f +DATA ·kcon+0x150(SB)/8, $0x4a7484aa4a7484aa +DATA ·kcon+0x158(SB)/8, $0x4a7484aa4a7484aa +DATA ·kcon+0x160(SB)/8, $0x5cb0a9dc5cb0a9dc +DATA ·kcon+0x168(SB)/8, $0x5cb0a9dc5cb0a9dc +DATA ·kcon+0x170(SB)/8, $0x76f988da76f988da +DATA ·kcon+0x178(SB)/8, $0x76f988da76f988da +DATA ·kcon+0x180(SB)/8, $0x983e5152983e5152 +DATA ·kcon+0x188(SB)/8, $0x983e5152983e5152 +DATA ·kcon+0x190(SB)/8, $0xa831c66da831c66d +DATA ·kcon+0x198(SB)/8, $0xa831c66da831c66d +DATA ·kcon+0x1A0(SB)/8, $0xb00327c8b00327c8 +DATA ·kcon+0x1A8(SB)/8, $0xb00327c8b00327c8 +DATA ·kcon+0x1B0(SB)/8, $0xbf597fc7bf597fc7 +DATA ·kcon+0x1B8(SB)/8, $0xbf597fc7bf597fc7 +DATA ·kcon+0x1C0(SB)/8, $0xc6e00bf3c6e00bf3 +DATA ·kcon+0x1C8(SB)/8, $0xc6e00bf3c6e00bf3 +DATA ·kcon+0x1D0(SB)/8, $0xd5a79147d5a79147 +DATA ·kcon+0x1D8(SB)/8, $0xd5a79147d5a79147 +DATA ·kcon+0x1E0(SB)/8, $0x06ca635106ca6351 +DATA ·kcon+0x1E8(SB)/8, $0x06ca635106ca6351 +DATA ·kcon+0x1F0(SB)/8, $0x1429296714292967 +DATA ·kcon+0x1F8(SB)/8, $0x1429296714292967 +DATA ·kcon+0x200(SB)/8, $0x27b70a8527b70a85 +DATA ·kcon+0x208(SB)/8, $0x27b70a8527b70a85 +DATA ·kcon+0x210(SB)/8, $0x2e1b21382e1b2138 +DATA ·kcon+0x218(SB)/8, $0x2e1b21382e1b2138 +DATA ·kcon+0x220(SB)/8, $0x4d2c6dfc4d2c6dfc +DATA ·kcon+0x228(SB)/8, $0x4d2c6dfc4d2c6dfc +DATA ·kcon+0x230(SB)/8, $0x53380d1353380d13 +DATA ·kcon+0x238(SB)/8, $0x53380d1353380d13 +DATA ·kcon+0x240(SB)/8, $0x650a7354650a7354 +DATA ·kcon+0x248(SB)/8, $0x650a7354650a7354 +DATA ·kcon+0x250(SB)/8, $0x766a0abb766a0abb +DATA ·kcon+0x258(SB)/8, $0x766a0abb766a0abb +DATA ·kcon+0x260(SB)/8, $0x81c2c92e81c2c92e +DATA ·kcon+0x268(SB)/8, $0x81c2c92e81c2c92e +DATA ·kcon+0x270(SB)/8, $0x92722c8592722c85 +DATA ·kcon+0x278(SB)/8, $0x92722c8592722c85 +DATA ·kcon+0x280(SB)/8, $0xa2bfe8a1a2bfe8a1 +DATA ·kcon+0x288(SB)/8, $0xa2bfe8a1a2bfe8a1 +DATA ·kcon+0x290(SB)/8, $0xa81a664ba81a664b +DATA ·kcon+0x298(SB)/8, $0xa81a664ba81a664b +DATA ·kcon+0x2A0(SB)/8, $0xc24b8b70c24b8b70 +DATA ·kcon+0x2A8(SB)/8, $0xc24b8b70c24b8b70 +DATA ·kcon+0x2B0(SB)/8, $0xc76c51a3c76c51a3 +DATA ·kcon+0x2B8(SB)/8, $0xc76c51a3c76c51a3 +DATA ·kcon+0x2C0(SB)/8, $0xd192e819d192e819 +DATA ·kcon+0x2C8(SB)/8, $0xd192e819d192e819 +DATA ·kcon+0x2D0(SB)/8, $0xd6990624d6990624 +DATA ·kcon+0x2D8(SB)/8, $0xd6990624d6990624 +DATA ·kcon+0x2E0(SB)/8, $0xf40e3585f40e3585 +DATA ·kcon+0x2E8(SB)/8, $0xf40e3585f40e3585 +DATA ·kcon+0x2F0(SB)/8, $0x106aa070106aa070 +DATA ·kcon+0x2F8(SB)/8, $0x106aa070106aa070 +DATA ·kcon+0x300(SB)/8, $0x19a4c11619a4c116 +DATA ·kcon+0x308(SB)/8, $0x19a4c11619a4c116 +DATA ·kcon+0x310(SB)/8, $0x1e376c081e376c08 +DATA ·kcon+0x318(SB)/8, $0x1e376c081e376c08 +DATA ·kcon+0x320(SB)/8, $0x2748774c2748774c +DATA ·kcon+0x328(SB)/8, $0x2748774c2748774c +DATA ·kcon+0x330(SB)/8, $0x34b0bcb534b0bcb5 +DATA ·kcon+0x338(SB)/8, $0x34b0bcb534b0bcb5 +DATA ·kcon+0x340(SB)/8, $0x391c0cb3391c0cb3 +DATA ·kcon+0x348(SB)/8, $0x391c0cb3391c0cb3 +DATA ·kcon+0x350(SB)/8, $0x4ed8aa4a4ed8aa4a +DATA ·kcon+0x358(SB)/8, $0x4ed8aa4a4ed8aa4a +DATA ·kcon+0x360(SB)/8, $0x5b9cca4f5b9cca4f +DATA ·kcon+0x368(SB)/8, $0x5b9cca4f5b9cca4f +DATA ·kcon+0x370(SB)/8, $0x682e6ff3682e6ff3 +DATA ·kcon+0x378(SB)/8, $0x682e6ff3682e6ff3 +DATA ·kcon+0x380(SB)/8, $0x748f82ee748f82ee +DATA ·kcon+0x388(SB)/8, $0x748f82ee748f82ee +DATA ·kcon+0x390(SB)/8, $0x78a5636f78a5636f +DATA ·kcon+0x398(SB)/8, $0x78a5636f78a5636f +DATA ·kcon+0x3A0(SB)/8, $0x84c8781484c87814 +DATA ·kcon+0x3A8(SB)/8, $0x84c8781484c87814 +DATA ·kcon+0x3B0(SB)/8, $0x8cc702088cc70208 +DATA ·kcon+0x3B8(SB)/8, $0x8cc702088cc70208 +DATA ·kcon+0x3C0(SB)/8, $0x90befffa90befffa +DATA ·kcon+0x3C8(SB)/8, $0x90befffa90befffa +DATA ·kcon+0x3D0(SB)/8, $0xa4506ceba4506ceb +DATA ·kcon+0x3D8(SB)/8, $0xa4506ceba4506ceb +DATA ·kcon+0x3E0(SB)/8, $0xbef9a3f7bef9a3f7 +DATA ·kcon+0x3E8(SB)/8, $0xbef9a3f7bef9a3f7 +DATA ·kcon+0x3F0(SB)/8, $0xc67178f2c67178f2 +DATA ·kcon+0x3F8(SB)/8, $0xc67178f2c67178f2 +DATA ·kcon+0x400(SB)/8, $0x0000000000000000 +DATA ·kcon+0x408(SB)/8, $0x0000000000000000 + +#ifdef GOARCH_ppc64le +DATA ·kcon+0x410(SB)/8, $0x1011121310111213 // permutation control vectors +DATA ·kcon+0x418(SB)/8, $0x1011121300010203 +DATA ·kcon+0x420(SB)/8, $0x1011121310111213 +DATA ·kcon+0x428(SB)/8, $0x0405060700010203 +DATA ·kcon+0x430(SB)/8, $0x1011121308090a0b +DATA ·kcon+0x438(SB)/8, $0x0405060700010203 +#else +DATA ·kcon+0x410(SB)/8, $0x1011121300010203 +DATA ·kcon+0x418(SB)/8, $0x1011121310111213 // permutation control vectors +DATA ·kcon+0x420(SB)/8, $0x0405060700010203 +DATA ·kcon+0x428(SB)/8, $0x1011121310111213 +DATA ·kcon+0x430(SB)/8, $0x0001020304050607 +DATA ·kcon+0x438(SB)/8, $0x08090a0b10111213 +#endif + +GLOBL ·kcon(SB), RODATA, $1088 + +#define SHA256ROUND0(a, b, c, d, e, f, g, h, xi) \ + VSEL g, f, e, FUNC; \ + VSHASIGMAW $15, e, $1, S1; \ + VADDUWM xi, h, h; \ + VSHASIGMAW $0, a, $1, S0; \ + VADDUWM FUNC, h, h; \ + VXOR b, a, FUNC; \ + VADDUWM S1, h, h; \ + VSEL b, c, FUNC, FUNC; \ + VADDUWM KI, g, g; \ + VADDUWM h, d, d; \ + VADDUWM FUNC, S0, S0; \ + LVX (TBL)(IDX), KI; \ + ADD $16, IDX; \ + VADDUWM S0, h, h + +#define SHA256ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \ + VSHASIGMAW $0, xj_1, $0, s0; \ + VSEL g, f, e, FUNC; \ + VSHASIGMAW $15, e, $1, S1; \ + VADDUWM xi, h, h; \ + VSHASIGMAW $0, a, $1, S0; \ + VSHASIGMAW $15, xj_14, $0, s1; \ + VADDUWM FUNC, h, h; \ + VXOR b, a, FUNC; \ + VADDUWM xj_9, xj, xj; \ + VADDUWM S1, h, h; \ + VSEL b, c, FUNC, FUNC; \ + VADDUWM KI, g, g; \ + VADDUWM h, d, d; \ + VADDUWM FUNC, S0, S0; \ + VADDUWM s0, xj, xj; \ + LVX (TBL)(IDX), KI; \ + ADD $16, IDX; \ + VADDUWM S0, h, h; \ + VADDUWM s1, xj, xj + +#ifdef GOARCH_ppc64le +#define VPERMLE(va,vb,vc,vt) VPERM va, vb, vc, vt +#else +#define VPERMLE(va,vb,vc,vt) +#endif + +// func block(dig *digest, p []byte) +TEXT ·block(SB),0,$0-32 + MOVD dig+0(FP), CTX + MOVD p_base+8(FP), INP + MOVD p_len+16(FP), LEN + + SRD $6, LEN + SLD $6, LEN + ADD INP, LEN, END + + CMP INP, END + BEQ end + + MOVD $·kcon(SB), TBL + MOVWZ $0x10, HEX10 + MOVWZ $8, IDX + +#ifdef GOARCH_ppc64le + LVSL (IDX)(R0), LEMASK + VSPLTISB $0x0F, KI + VXOR KI, LEMASK, LEMASK +#endif + + LXVW4X (CTX)(HEX00), VS32 // v0 = vs32 + LXVW4X (CTX)(HEX10), VS36 // v4 = vs36 + + // unpack the input values into vector registers + VSLDOI $4, V0, V0, V1 + VSLDOI $8, V0, V0, V2 + VSLDOI $12, V0, V0, V3 + VSLDOI $4, V4, V4, V5 + VSLDOI $8, V4, V4, V6 + VSLDOI $12, V4, V4, V7 + +loop: + LVX (TBL)(HEX00), KI + MOVWZ $16, IDX + + LXVD2X (INP)(R0), VS40 // load v8 (=vs40) in advance + ADD $16, INP + + // Offload to VSR24-31 (aka FPR24-31) + XXLORQ VS32, VS32, VS24 + XXLORQ VS33, VS33, VS25 + XXLORQ VS34, VS34, VS26 + XXLORQ VS35, VS35, VS27 + XXLORQ VS36, VS36, VS28 + XXLORQ VS37, VS37, VS29 + XXLORQ VS38, VS38, VS30 + XXLORQ VS39, VS39, VS31 + + VADDUWM KI, V7, V7 // h+K[i] + LVX (TBL)(IDX), KI + ADD $16, IDX + + VPERMLE(V8, V8, LEMASK, V8) + SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8) + VSLDOI $4, V8, V8, V9 + SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9) + VSLDOI $4, V9, V9, V10 + SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10) + LXVD2X (INP)(R0), VS44 // load v12 (=vs44) in advance + ADD $16, INP, INP + VSLDOI $4, V10, V10, V11 + SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11) + VPERMLE(V12, V12, LEMASK, V12) + SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12) + VSLDOI $4, V12, V12, V13 + SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13) + VSLDOI $4, V13, V13, V14 + SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14) + LXVD2X (INP)(R0), VS48 // load v16 (=vs48) in advance + ADD $16, INP, INP + VSLDOI $4, V14, V14, V15 + SHA256ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15) + VPERMLE(V16, V16, LEMASK, V16) + SHA256ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16) + VSLDOI $4, V16, V16, V17 + SHA256ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17) + VSLDOI $4, V17, V17, V18 + SHA256ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18) + VSLDOI $4, V18, V18, V19 + LXVD2X (INP)(R0), VS52 // load v20 (=vs52) in advance + ADD $16, INP, INP + SHA256ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19) + VPERMLE(V20, V20, LEMASK, V20) + SHA256ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20) + VSLDOI $4, V20, V20, V21 + SHA256ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21) + VSLDOI $4, V21, V21, V22 + SHA256ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22) + VSLDOI $4, V22, V22, V23 + SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22) + + MOVWZ $3, TEMP + MOVWZ TEMP, CTR + +L16_xx: + SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23) + SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8) + SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9) + SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10) + SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11) + SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12) + SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13) + SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14) + SHA256ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15) + SHA256ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16) + SHA256ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17) + SHA256ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18) + SHA256ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19) + SHA256ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20) + SHA256ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21) + SHA256ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22) + + BC 0x10, 0, L16_xx // bdnz + + XXLORQ VS24, VS24, VS42 + + XXLORQ VS25, VS25, VS43 + VADDUWM V10, V0, V0 + XXLORQ VS26, VS26, VS44 + VADDUWM V11, V1, V1 + XXLORQ VS27, VS27, VS45 + VADDUWM V12, V2, V2 + XXLORQ VS28, VS28, VS46 + VADDUWM V13, V3, V3 + XXLORQ VS29, VS29, VS47 + VADDUWM V14, V4, V4 + XXLORQ VS30, VS30, VS48 + VADDUWM V15, V5, V5 + XXLORQ VS31, VS31, VS49 + VADDUWM V16, V6, V6 + VADDUWM V17, V7, V7 + + CMPU INP, END + BLT loop + + LVX (TBL)(IDX), V8 + ADD $16, IDX + VPERM V0, V1, KI, V0 + LVX (TBL)(IDX), V9 + VPERM V4, V5, KI, V4 + VPERM V0, V2, V8, V0 + VPERM V4, V6, V8, V4 + VPERM V0, V3, V9, V0 + VPERM V4, V7, V9, V4 + STXVD2X VS32, (CTX+HEX00) // v0 = vs32 + STXVD2X VS36, (CTX+HEX10) // v4 = vs36 + +end: + RET + -- 2.50.0