--- /dev/null
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64le
+
+package aes
+
+import (
+ "crypto/cipher"
+ "crypto/subtle"
+ "encoding/binary"
+ "errors"
+)
+
+// This file implements GCM using an optimized GHASH function.
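+//
+// GHASH absorbs the input in 16-byte blocks into a running digest:
+// Y_i = (Y_{i-1} ^ X_i) · H, where · is multiplication in GF(2^128) and the
+// hash key H is the encryption of the all-zero block.
+//
+// A minimal usage sketch (key, nonce and plaintext are illustrative; on
+// ppc64le, crypto/cipher.NewGCM reaches this code via the gcmAble interface):
+//
+//	block, err := aes.NewCipher(key) // 16-, 24- or 32-byte key
+//	if err != nil {
+//		// handle the error
+//	}
+//	aead, err := cipher.NewGCM(block)
+//	if err != nil {
+//		// handle the error
+//	}
+//	sealed := aead.Seal(nil, nonce, plaintext, nil)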
+
+//go:noescape
+func gcmInit(productTable *[256]byte, h []byte)
+
+//go:noescape
+func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
+
+//go:noescape
+func gcmMul(output []byte, productTable *[256]byte)
+
+const (
+ gcmCounterSize = 16
+ gcmBlockSize = 16
+ gcmTagSize = 16
+ gcmStandardNonceSize = 12
+)
+
+var errOpen = errors.New("cipher: message authentication failed")
+
+// Assert that aesCipherAsm implements the gcmAble interface.
+var _ gcmAble = (*aesCipherAsm)(nil)
+
+type gcmAsm struct {
+ cipher *aesCipherAsm
+ // ks is the key schedule, the length of which depends on the size of
+ // the AES key.
+ ks []uint32
+ // productTable contains pre-computed multiples of the binary-field
+ // element used in GHASH.
+ productTable [256]byte
+ // nonceSize contains the expected size of the nonce, in bytes.
+ nonceSize int
+ // tagSize contains the size of the tag, in bytes.
+ tagSize int
+}
+
+// NewGCM returns the AES cipher wrapped in Galois Counter Mode. This is only
+// called by crypto/cipher.NewGCM via the gcmAble interface.
+func (c *aesCipherAsm) NewGCM(nonceSize, tagSize int) (cipher.AEAD, error) {
+ g := &gcmAsm{cipher: c, ks: c.enc, nonceSize: nonceSize, tagSize: tagSize}
+
+ hle := make([]byte, gcmBlockSize)
+ c.Encrypt(hle, hle)
+
+ // Reverse the bytes in each 8-byte chunk:
+ // load little endian, store big endian.
+ h1 := binary.LittleEndian.Uint64(hle[:8])
+ h2 := binary.LittleEndian.Uint64(hle[8:])
+ binary.BigEndian.PutUint64(hle[:8], h1)
+ binary.BigEndian.PutUint64(hle[8:], h2)
+ gcmInit(&g.productTable, hle)
+
+ return g, nil
+}
+
+func (g *gcmAsm) NonceSize() int {
+ return g.nonceSize
+}
+
+func (g *gcmAsm) Overhead() int {
+ return g.tagSize
+}
+
+// sliceForAppend extends in by n bytes. head is the full extended slice,
+// while tail aliases the appended n bytes. If in has sufficient capacity,
+// no allocation is performed.
+func sliceForAppend(in []byte, n int) (head, tail []byte) {
+ if total := len(in) + n; cap(in) >= total {
+ head = in[:total]
+ } else {
+ head = make([]byte, total)
+ copy(head, in)
+ }
+ tail = head[len(in):]
+ return
+}
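+
+// For example (illustrative): with in := make([]byte, 2, 6) and n = 4,
+// sliceForAppend returns head = in[:6] and tail = head[2:], so bytes written
+// to tail become visible through head without copying the existing prefix.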
+
+// deriveCounter computes the initial GCM counter state from the given nonce.
+func (g *gcmAsm) deriveCounter(counter *[gcmBlockSize]byte, nonce []byte) {
+ if len(nonce) == gcmStandardNonceSize {
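+ // Standard 96-bit nonce: the counter block is the nonce followed by
+ // a big-endian 32-bit block count of 1.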
+ copy(counter[:], nonce)
+ counter[gcmBlockSize-1] = 1
+ } else {
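+ // Non-standard nonce sizes: per NIST SP 800-38D, the initial counter
+ // is GHASH over the zero-padded nonce followed by a block encoding
+ // the nonce length in bits.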
+ var hash [16]byte
+ g.paddedGHASH(&hash, nonce)
+ lens := gcmLengths(0, uint64(len(nonce))*8)
+ g.paddedGHASH(&hash, lens[:])
+ copy(counter[:], hash[:])
+ }
+}
+
+// counterCrypt encrypts in using AES in counter mode and places the result
+// into out. counter is the initial counter value and is left holding the
+// next counter value. The length of out must be greater than or equal to
+// the length of in.
+func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte) {
+ var mask [gcmBlockSize]byte
+
+ for len(in) >= gcmBlockSize {
+ // Hint to avoid bounds check
+ _, _ = in[15], out[15]
+ g.cipher.Encrypt(mask[:], counter[:])
+ gcmInc32(counter)
+
+ // XOR 16 bytes each loop iteration in 8 byte chunks
+ in0 := binary.LittleEndian.Uint64(in[0:])
+ in1 := binary.LittleEndian.Uint64(in[8:])
+ m0 := binary.LittleEndian.Uint64(mask[:8])
+ m1 := binary.LittleEndian.Uint64(mask[8:])
+ binary.LittleEndian.PutUint64(out[:8], in0^m0)
+ binary.LittleEndian.PutUint64(out[8:], in1^m1)
+ out = out[16:]
+ in = in[16:]
+ }
+
+ if len(in) > 0 {
+ g.cipher.Encrypt(mask[:], counter[:])
+ gcmInc32(counter)
+ // XOR leftover bytes
+ for i, inb := range in {
+ out[i] = inb ^ mask[i]
+ }
+ }
+}
+
+// gcmInc32 increments the rightmost 32 bits of the counter block by 1,
+// wrapping modulo 2^32 as GCM's 32-bit counter requires.
+func gcmInc32(counterBlock *[16]byte) {
+ c := counterBlock[len(counterBlock)-4:]
+ x := binary.BigEndian.Uint32(c) + 1
+ binary.BigEndian.PutUint32(c, x)
+}
+
+// paddedGHASH pads data with zeroes until its length is a multiple of
+// 16 bytes. It then calculates a new value for hash using the GHASH
+// algorithm.
+func (g *gcmAsm) paddedGHASH(hash *[16]byte, data []byte) {
+ if siz := len(data) - (len(data) % gcmBlockSize); siz > 0 {
+ gcmHash(hash[:], &g.productTable, data[:], siz)
+ data = data[siz:]
+ }
+ if len(data) > 0 {
+ var s [16]byte
+ copy(s[:], data)
+ gcmHash(hash[:], &g.productTable, s[:], len(s))
+ }
+}
+
+// auth calculates GHASH(ciphertext, aad), masks the result with
+// tagMask and writes the result to out.
+func (g *gcmAsm) auth(out, ciphertext, aad []byte, tagMask *[gcmTagSize]byte) {
+ var hash [16]byte
+ g.paddedGHASH(&hash, aad)
+ g.paddedGHASH(&hash, ciphertext)
+ lens := gcmLengths(uint64(len(aad))*8, uint64(len(ciphertext))*8)
+ g.paddedGHASH(&hash, lens[:])
+
+ copy(out, hash[:])
+ for i := range out {
+ out[i] ^= tagMask[i]
+ }
+}
+
+// Seal encrypts and authenticates plaintext. See the cipher.AEAD interface for
+// details.
+func (g *gcmAsm) Seal(dst, nonce, plaintext, data []byte) []byte {
+ if len(nonce) != g.nonceSize {
+ panic("cipher: incorrect nonce length given to GCM")
+ }
+ if uint64(len(plaintext)) > ((1<<32)-2)*BlockSize {
+ panic("cipher: message too large for GCM")
+ }
+
+ ret, out := sliceForAppend(dst, len(plaintext)+g.tagSize)
+
+ var counter, tagMask [gcmBlockSize]byte
+ g.deriveCounter(&counter, nonce)
+
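+ // The tag mask is the encryption of the initial counter block
+ // (E(K, J0)); the counter is then advanced past it for the payload.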
+ g.cipher.Encrypt(tagMask[:], counter[:])
+ gcmInc32(&counter)
+
+ g.counterCrypt(out, plaintext, &counter)
+ g.auth(out[len(plaintext):], out[:len(plaintext)], data, &tagMask)
+
+ return ret
+}
+
+// Open authenticates and decrypts ciphertext. See the cipher.AEAD interface
+// for details.
+func (g *gcmAsm) Open(dst, nonce, ciphertext, data []byte) ([]byte, error) {
+ if len(nonce) != g.nonceSize {
+ panic("cipher: incorrect nonce length given to GCM")
+ }
+ if len(ciphertext) < g.tagSize {
+ return nil, errOpen
+ }
+ if uint64(len(ciphertext)) > ((1<<32)-2)*uint64(BlockSize)+uint64(g.tagSize) {
+ return nil, errOpen
+ }
+
+ tag := ciphertext[len(ciphertext)-g.tagSize:]
+ ciphertext = ciphertext[:len(ciphertext)-g.tagSize]
+
+ var counter, tagMask [gcmBlockSize]byte
+ g.deriveCounter(&counter, nonce)
+
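+ // As in Seal, the tag mask is the encryption of the initial counter
+ // block, and the counter is advanced past it before decryption.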
+ g.cipher.Encrypt(tagMask[:], counter[:])
+ gcmInc32(&counter)
+
+ var expectedTag [gcmTagSize]byte
+ g.auth(expectedTag[:], ciphertext, data, &tagMask)
+
+ ret, out := sliceForAppend(dst, len(ciphertext))
+
+ if subtle.ConstantTimeCompare(expectedTag[:g.tagSize], tag) != 1 {
+ for i := range out {
+ out[i] = 0
+ }
+ return nil, errOpen
+ }
+
+ g.counterCrypt(out, ciphertext, &counter)
+ return ret, nil
+}
+
+func gcmLengths(len0, len1 uint64) [16]byte {
+ return [16]byte{
+ byte(len0 >> 56),
+ byte(len0 >> 48),
+ byte(len0 >> 40),
+ byte(len0 >> 32),
+ byte(len0 >> 24),
+ byte(len0 >> 16),
+ byte(len0 >> 8),
+ byte(len0),
+ byte(len1 >> 56),
+ byte(len1 >> 48),
+ byte(len1 >> 40),
+ byte(len1 >> 32),
+ byte(len1 >> 24),
+ byte(len1 >> 16),
+ byte(len1 >> 8),
+ byte(len1),
+ }
+}
--- /dev/null
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Based on CRYPTOGAMS code with the following comment:
+// # ====================================================================
+// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+// # project. The module is, however, dual licensed under OpenSSL and
+// # CRYPTOGAMS licenses depending on where you obtain it. For further
+// # details see http://www.openssl.org/~appro/cryptogams/.
+// # ====================================================================
+
+// This implementation is based on the ppc64 asm generated by the
+// script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
+// from commit d47afb3c.
+
+// Changes were made to account for differences in the Go ABI and in
+// register usage, and some arguments were adjusted to match how the Go
+// code passes them.
+
+#include "textflag.h"
+
+#define XIP R3
+#define HTBL R4
+#define INP R5
+#define LEN R6
+
+#define XL V0
+#define XM V1
+#define XH V2
+#define IN V3
+#define ZERO V4
+#define T0 V5
+#define T1 V6
+#define T2 V7
+#define XC2 V8
+#define H V9
+#define HH V10
+#define HL V11
+#define LEMASK V12
+#define XL1 V13
+#define XM1 V14
+#define XH1 V15
+#define IN1 V16
+#define H2 V17
+#define H2H V18
+#define H2L V19
+#define XL3 V20
+#define XM2 V21
+#define IN2 V22
+#define H3L V23
+#define H3 V24
+#define H3H V25
+#define XH3 V26
+#define XM3 V27
+#define IN3 V28
+#define H4L V29
+#define H4 V30
+#define H4H V31
+
+#define IN0 IN
+#define H21L HL
+#define H21H HH
+#define LOPERM H2L
+#define HIPERM H2H
+
+#define VXL VS32
+#define VIN VS35
+#define VXC2 VS40
+#define VH VS41
+#define VHH VS42
+#define VHL VS43
+#define VIN1 VS48
+#define VH2 VS49
+#define VH2H VS50
+#define VH2L VS51
+
+#define VIN2 VS54
+#define VH3L VS55
+#define VH3 VS56
+#define VH3H VS57
+#define VIN3 VS60
+#define VH4L VS61
+#define VH4 VS62
+#define VH4H VS63
+
+#define VIN0 VIN
+
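+// The product table layout written below (byte offsets into productTable):
+// 0x00 holds the 0xc2... reduction constant; 0x10/0x20/0x30 hold H.lo, H and
+// H.hi; 0x40/0x50/0x60 the same for H^2; 0x70/0x80/0x90 for H^3; and
+// 0xa0/0xb0/0xc0 for H^4.
+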
+// func gcmInit(productTable *[256]byte, h []byte)
+TEXT ·gcmInit(SB), NOSPLIT, $0-32
+ MOVD productTable+0(FP), XIP
+ MOVD h+8(FP), HTBL
+
+ MOVD $0x10, R8
+ MOVD $0x20, R9
+ MOVD $0x30, R10
+ LXVD2X (HTBL)(R0), VH // Load H
+
+ VSPLTISB $-16, XC2 // 0xf0
+ VSPLTISB $1, T0 // one
+ VADDUBM XC2, XC2, XC2 // 0xe0
+ VXOR ZERO, ZERO, ZERO
+ VOR XC2, T0, XC2 // 0xe1
+ VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
+ VSLDOI $1, ZERO, T0, T1 // ...1
+ VADDUBM XC2, XC2, XC2 // 0xc2...
+ VSPLTISB $7, T2
+ VOR XC2, T1, XC2 // 0xc2....01
+ VSPLTB $0, H, T1 // most significant byte
+ VSL H, T0, H // H<<=1
+ VSRAB T1, T2, T1 // broadcast carry bit
+ VAND T1, XC2, T1
+ VXOR H, T1, IN // twisted H
+
+ VSLDOI $8, IN, IN, H // twist even more ...
+ VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
+ VSLDOI $8, ZERO, H, HL // ... and split
+ VSLDOI $8, H, ZERO, HH
+
+ STXVD2X VXC2, (XIP+R0) // save pre-computed table
+ STXVD2X VHL, (XIP+R8)
+ MOVD $0x40, R8
+ STXVD2X VH, (XIP+R9)
+ MOVD $0x50, R9
+ STXVD2X VHH, (XIP+R10)
+ MOVD $0x60, R10
+
+ VPMSUMD IN, HL, XL // H.lo·H.lo
+ VPMSUMD IN, H, XM // H.hi·H.lo+H.lo·H.hi
+ VPMSUMD IN, HH, XH // H.hi·H.hi
+
+ VPMSUMD XL, XC2, T2 // 1st reduction phase
+
+ VSLDOI $8, XM, ZERO, T0
+ VSLDOI $8, ZERO, XM, T1
+ VXOR XL, T0, XL
+ VXOR XH, T1, XH
+
+ VSLDOI $8, XL, XL, XL
+ VXOR XL, T2, XL
+
+ VSLDOI $8, XL, XL, T1 // 2nd reduction phase
+ VPMSUMD XL, XC2, XL
+ VXOR T1, XH, T1
+ VXOR XL, T1, IN1
+
+ VSLDOI $8, IN1, IN1, H2
+ VSLDOI $8, ZERO, H2, H2L
+ VSLDOI $8, H2, ZERO, H2H
+
+ STXVD2X VH2L, (XIP+R8) // save H^2
+ MOVD $0x70, R8
+ STXVD2X VH2, (XIP+R9)
+ MOVD $0x80, R9
+ STXVD2X VH2H, (XIP+R10)
+ MOVD $0x90, R10
+
+ VPMSUMD IN, H2L, XL // H.lo·H^2.lo
+ VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
+ VPMSUMD IN, H2, XM // H.hi·H^2.lo+H.lo·H^2.hi
+ VPMSUMD IN1, H2, XM1 // H^2.hi·H^2.lo+H^2.lo·H^2.hi
+ VPMSUMD IN, H2H, XH // H.hi·H^2.hi
+ VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi
+
+ VPMSUMD XL, XC2, T2 // 1st reduction phase
+ VPMSUMD XL1, XC2, HH // 1st reduction phase
+
+ VSLDOI $8, XM, ZERO, T0
+ VSLDOI $8, ZERO, XM, T1
+ VSLDOI $8, XM1, ZERO, HL
+ VSLDOI $8, ZERO, XM1, H
+ VXOR XL, T0, XL
+ VXOR XH, T1, XH
+ VXOR XL1, HL, XL1
+ VXOR XH1, H, XH1
+
+ VSLDOI $8, XL, XL, XL
+ VSLDOI $8, XL1, XL1, XL1
+ VXOR XL, T2, XL
+ VXOR XL1, HH, XL1
+
+ VSLDOI $8, XL, XL, T1 // 2nd reduction phase
+ VSLDOI $8, XL1, XL1, H // 2nd reduction phase
+ VPMSUMD XL, XC2, XL
+ VPMSUMD XL1, XC2, XL1
+ VXOR T1, XH, T1
+ VXOR H, XH1, H
+ VXOR XL, T1, XL
+ VXOR XL1, H, XL1
+
+ VSLDOI $8, XL, XL, H
+ VSLDOI $8, XL1, XL1, H2
+ VSLDOI $8, ZERO, H, HL
+ VSLDOI $8, H, ZERO, HH
+ VSLDOI $8, ZERO, H2, H2L
+ VSLDOI $8, H2, ZERO, H2H
+
+ STXVD2X VHL, (XIP+R8) // save H^3
+ MOVD $0xa0, R8
+ STXVD2X VH, (XIP+R9)
+ MOVD $0xb0, R9
+ STXVD2X VHH, (XIP+R10)
+ MOVD $0xc0, R10
+ STXVD2X VH2L, (XIP+R8) // save H^4
+ STXVD2X VH2, (XIP+R9)
+ STXVD2X VH2H, (XIP+R10)
+
+ RET
+
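+// gcmHash folds len bytes of input into the 16-byte GHASH digest at output.
+// The Go callers always pass len as a multiple of the 16-byte block size
+// (paddedGHASH zero-pads any short tail before calling in).
+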
+// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
+TEXT ·gcmHash(SB), NOSPLIT, $0-64
+ MOVD output+0(FP), XIP
+ MOVD productTable+24(FP), HTBL
+ MOVD inp+32(FP), INP
+ MOVD len+56(FP), LEN
+
+ MOVD $0x10, R8
+ MOVD $0x20, R9
+ MOVD $0x30, R10
+ LXVD2X (XIP)(R0), VXL // load Xi
+
+ LXVD2X (HTBL)(R8), VHL // load pre-computed table
+ MOVD $0x40, R8
+ LVSL (R0)(R0), LEMASK
+ LXVD2X (HTBL)(R9), VH
+ MOVD $0x50, R9
+ VSPLTISB $0x07, T0
+ LXVD2X (HTBL)(R10), VHH
+ MOVD $0x60, R10
+ VXOR LEMASK, T0, LEMASK
+ LXVD2X (HTBL)(R0), VXC2
+ VPERM XL, XL, LEMASK, XL
+ VXOR ZERO, ZERO, ZERO
+
+ CMPU LEN, $64
+ BGE gcm_ghash_p8_4x
+
+ LXVD2X (INP)(R0), VIN
+ ADD $16, INP, INP
+ SUBCCC $16, LEN, LEN
+ VPERM IN, IN, LEMASK, IN
+ VXOR IN, XL, IN
+ BEQ short
+
+ LXVD2X (HTBL)(R8), VH2L // load H^2
+ MOVD $16, R8
+ LXVD2X (HTBL)(R9), VH2
+ ADD LEN, INP, R9 // end of input
+ LXVD2X (HTBL)(R10), VH2H
+
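+// loop_2x folds two blocks per iteration: the running digest (already XORed
+// into IN) is multiplied by H^2 while the following block is multiplied by H,
+// and the products are summed before a single reduction.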
+loop_2x:
+ LXVD2X (INP)(R0), VIN1
+ VPERM IN1, IN1, LEMASK, IN1
+
+ SUBC $32, LEN, LEN
+ VPMSUMD IN, H2L, XL // H^2.lo·Xi.lo
+ VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
+ SUBE R11, R11, R11 // borrow ? -1 : 0
+ VPMSUMD IN, H2, XM // H^2.hi·Xi.lo+H^2.lo·Xi.hi
+ VPMSUMD IN1, H, XM1 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
+ AND LEN, R11, R11
+ VPMSUMD IN, H2H, XH // H^2.hi·Xi.hi
+ VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
+ ADD R11, INP, INP
+
+ VXOR XL, XL1, XL
+ VXOR XM, XM1, XM
+
+ VPMSUMD XL, XC2, T2 // 1st reduction phase
+
+ VSLDOI $8, XM, ZERO, T0
+ VSLDOI $8, ZERO, XM, T1
+ VXOR XH, XH1, XH
+ VXOR XL, T0, XL
+ VXOR XH, T1, XH
+
+ VSLDOI $8, XL, XL, XL
+ VXOR XL, T2, XL
+ LXVD2X (INP)(R8), VIN
+ ADD $32, INP, INP
+
+ VSLDOI $8, XL, XL, T1 // 2nd reduction phase
+ VPMSUMD XL, XC2, XL
+ VPERM IN, IN, LEMASK, IN
+ VXOR T1, XH, T1
+ VXOR IN, T1, IN
+ VXOR IN, XL, IN
+ CMP R9, INP
+ BGT loop_2x // done yet?
+
+ CMPWU LEN, $0
+ BNE even
+
+short:
+ VPMSUMD IN, HL, XL // H.lo·Xi.lo
+ VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
+ VPMSUMD IN, HH, XH // H.hi·Xi.hi
+
+ VPMSUMD XL, XC2, T2 // 1st reduction phase
+
+ VSLDOI $8, XM, ZERO, T0
+ VSLDOI $8, ZERO, XM, T1
+ VXOR XL, T0, XL
+ VXOR XH, T1, XH
+
+ VSLDOI $8, XL, XL, XL
+ VXOR XL, T2, XL
+
+ VSLDOI $8, XL, XL, T1 // 2nd reduction phase
+ VPMSUMD XL, XC2, XL
+ VXOR T1, XH, T1
+
+even:
+ VXOR XL, T1, XL
+ VPERM XL, XL, LEMASK, XL
+ STXVD2X VXL, (XIP+R0) // write out Xi
+
+ OR R12, R12, R12
+ RET
+
+gcm_ghash_p8_4x:
+ LVSL (R8)(R0), T0 // 0x0001..0e0f
+ MOVD $0x70, R8
+ LXVD2X (HTBL)(R9), VH2
+ MOVD $0x80, R9
+ VSPLTISB $8, T1 // 0x0808..0808
+ MOVD $0x90, R10
+ LXVD2X (HTBL)(R8), VH3L // load H^3
+ MOVD $0xa0, R8
+ LXVD2X (HTBL)(R9), VH3
+ MOVD $0xb0, R9
+ LXVD2X (HTBL)(R10), VH3H
+ MOVD $0xc0, R10
+ LXVD2X (HTBL)(R8), VH4L // load H^4
+ MOVD $0x10, R8
+ LXVD2X (HTBL)(R9), VH4
+ MOVD $0x20, R9
+ LXVD2X (HTBL)(R10), VH4H
+ MOVD $0x30, R10
+
+ VSLDOI $8, ZERO, T1, T2 // 0x0000..0808
+ VADDUBM T0, T2, HIPERM // 0x0001..1617
+ VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f
+
+ SRD $4, LEN, LEN // this allows using the sign bit as a carry
+
+ LXVD2X (INP)(R0), VIN0 // load input
+ LXVD2X (INP)(R8), VIN1
+ SUBCCC $8, LEN, LEN
+ LXVD2X (INP)(R9), VIN2
+ LXVD2X (INP)(R10), VIN3
+ ADD $0x40, INP, INP
+ VPERM IN0, IN0, LEMASK, IN0
+ VPERM IN1, IN1, LEMASK, IN1
+ VPERM IN2, IN2, LEMASK, IN2
+ VPERM IN3, IN3, LEMASK, IN3
+
+ VXOR IN0, XL, XH
+
+ VPMSUMD IN1, H3L, XL1
+ VPMSUMD IN1, H3, XM1
+ VPMSUMD IN1, H3H, XH1
+
+ VPERM H2, H, HIPERM, H21L
+ VPERM IN2, IN3, LOPERM, T0
+ VPERM H2, H, LOPERM, H21H
+ VPERM IN2, IN3, HIPERM, T1
+ VPMSUMD IN2, H2, XM2 // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
+ VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
+ VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
+ VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
+
+ VXOR XM2, XM1, XM2
+ VXOR XL3, XL1, XL3
+ VXOR XM3, XM2, XM3
+ VXOR XH3, XH1, XH3
+
+ BLT tail_4x
+
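+// loop_4x folds four blocks per iteration: the accumulated digest in XH is
+// multiplied by H^4 while the three newer blocks use H^3 and the packed
+// H^2/H pair, with one reduction per iteration.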
+loop_4x:
+ LXVD2X (INP)(R0), VIN0
+ LXVD2X (INP)(R8), VIN1
+ SUBCCC $4, LEN, LEN
+ LXVD2X (INP)(R9), VIN2
+ LXVD2X (INP)(R10), VIN3
+ ADD $0x40, INP, INP
+ VPERM IN1, IN1, LEMASK, IN1
+ VPERM IN2, IN2, LEMASK, IN2
+ VPERM IN3, IN3, LEMASK, IN3
+ VPERM IN0, IN0, LEMASK, IN0
+
+ VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
+ VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
+ VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
+ VPMSUMD IN1, H3L, XL1
+ VPMSUMD IN1, H3, XM1
+ VPMSUMD IN1, H3H, XH1
+
+ VXOR XL, XL3, XL
+ VXOR XM, XM3, XM
+ VXOR XH, XH3, XH
+ VPERM IN2, IN3, LOPERM, T0
+ VPERM IN2, IN3, HIPERM, T1
+
+ VPMSUMD XL, XC2, T2 // 1st reduction phase
+ VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo +H^2.lo·Xi+2.lo
+ VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi +H^2.hi·Xi+2.hi
+
+ VSLDOI $8, XM, ZERO, T0
+ VSLDOI $8, ZERO, XM, T1
+ VXOR XL, T0, XL
+ VXOR XH, T1, XH
+
+ VSLDOI $8, XL, XL, XL
+ VXOR XL, T2, XL
+
+ VSLDOI $8, XL, XL, T1 // 2nd reduction phase
+ VPMSUMD IN2, H2, XM2 // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
+ VPMSUMD IN3, H, XM3 // H.hi·Xi+3.lo +H.lo·Xi+3.hi
+ VPMSUMD XL, XC2, XL
+
+ VXOR XL3, XL1, XL3
+ VXOR XH3, XH1, XH3
+ VXOR XH, IN0, XH
+ VXOR XM2, XM1, XM2
+ VXOR XH, T1, XH
+ VXOR XM3, XM2, XM3
+ VXOR XH, XL, XH
+ BGE loop_4x
+
+tail_4x:
+ VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
+ VPMSUMD XH, H4, XM // H^4.hi·Xi.lo+H^4.lo·Xi.hi
+ VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
+
+ VXOR XL, XL3, XL
+ VXOR XM, XM3, XM
+
+ VPMSUMD XL, XC2, T2 // 1st reduction phase
+
+ VSLDOI $8, XM, ZERO, T0
+ VSLDOI $8, ZERO, XM, T1
+ VXOR XH, XH3, XH
+ VXOR XL, T0, XL
+ VXOR XH, T1, XH
+
+ VSLDOI $8, XL, XL, XL
+ VXOR XL, T2, XL
+
+ VSLDOI $8, XL, XL, T1 // 2nd reduction phase
+ VPMSUMD XL, XC2, XL
+ VXOR T1, XH, T1
+ VXOR XL, T1, XL
+
+ ADDCCC $4, LEN, LEN
+ BEQ done_4x
+
+ LXVD2X (INP)(R0), VIN0
+ CMPU LEN, $2
+ MOVD $-4, LEN
+ BLT one
+ LXVD2X (INP)(R8), VIN1
+ BEQ two
+
+three:
+ LXVD2X (INP)(R9), VIN2
+ VPERM IN0, IN0, LEMASK, IN0
+ VPERM IN1, IN1, LEMASK, IN1
+ VPERM IN2, IN2, LEMASK, IN2
+
+ VXOR IN0, XL, XH
+ VOR H3L, H3L, H4L
+ VOR H3, H3, H4
+ VOR H3H, H3H, H4H
+
+ VPERM IN1, IN2, LOPERM, T0
+ VPERM IN1, IN2, HIPERM, T1
+ VPMSUMD IN1, H2, XM2 // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
+ VPMSUMD IN2, H, XM3 // H.hi·Xi+2.lo +H.lo·Xi+2.hi
+ VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
+ VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
+
+ VXOR XM3, XM2, XM3
+ JMP tail_4x
+
+two:
+ VPERM IN0, IN0, LEMASK, IN0
+ VPERM IN1, IN1, LEMASK, IN1
+
+ VXOR IN0, XL, XH
+ VPERM ZERO, IN1, LOPERM, T0
+ VPERM ZERO, IN1, HIPERM, T1
+
+ VSLDOI $8, ZERO, H2, H4L
+ VOR H2, H2, H4
+ VSLDOI $8, H2, ZERO, H4H
+
+ VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
+ VPMSUMD IN1, H, XM3 // H.hi·Xi+1.lo+H.lo·Xi+1.hi
+ VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi
+
+ JMP tail_4x
+
+one:
+ VPERM IN0, IN0, LEMASK, IN0
+
+ VSLDOI $8, ZERO, H, H4L
+ VOR H, H, H4
+ VSLDOI $8, H, ZERO, H4H
+
+ VXOR IN0, XL, XH
+ VXOR XL3, XL3, XL3
+ VXOR XM3, XM3, XM3
+ VXOR XH3, XH3, XH3
+
+ JMP tail_4x
+
+done_4x:
+ VPERM XL, XL, LEMASK, XL
+ STXVD2X VXL, (XIP+R0) // write out Xi
+ RET
+
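+// gcmMul multiplies the 16-byte block at output by H in GF(2^128), using the
+// pre-computed table, and writes the reduced product back to output.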
+// func gcmMul(output []byte, productTable *[256]byte)
+TEXT ·gcmMul(SB), NOSPLIT, $0-32
+ MOVD output+0(FP), XIP
+ MOVD productTable+24(FP), HTBL
+
+ MOVD $0x10, R8
+ MOVD $0x20, R9
+ MOVD $0x30, R10
+ LXVD2X (XIP)(R0), VIN // load Xi
+
+ LXVD2X (HTBL)(R8), VHL // Load pre-computed table
+ LVSL (R0)(R0), LEMASK
+ LXVD2X (HTBL)(R9), VH
+ VSPLTISB $0x07, T0
+ LXVD2X (HTBL)(R10), VHH
+ VXOR LEMASK, T0, LEMASK
+ LXVD2X (HTBL)(R0), VXC2
+ VPERM IN, IN, LEMASK, IN
+ VXOR ZERO, ZERO, ZERO
+
+ VPMSUMD IN, HL, XL // H.lo·Xi.lo
+ VPMSUMD IN, H, XM // H.hi·Xi.lo+H.lo·Xi.hi
+ VPMSUMD IN, HH, XH // H.hi·Xi.hi
+
+ VPMSUMD XL, XC2, T2 // 1st reduction phase
+
+ VSLDOI $8, XM, ZERO, T0
+ VSLDOI $8, ZERO, XM, T1
+ VXOR XL, T0, XL
+ VXOR XH, T1, XH
+
+ VSLDOI $8, XL, XL, XL
+ VXOR XL, T2, XL
+
+ VSLDOI $8, XL, XL, T1 // 2nd reduction phase
+ VPMSUMD XL, XC2, XL
+ VXOR T1, XH, T1
+ VXOR XL, T1, XL
+
+ VPERM XL, XL, LEMASK, XL
+ STXVD2X VXL, (XIP+R0) // write out Xi
+ RET