diff options
Diffstat (limited to 'vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s')
-rw-r--r-- | vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s | 501 |
1 files changed, 501 insertions, 0 deletions
diff --git a/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s new file mode 100644 index 000000000..a660b4112 --- /dev/null +++ b/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s @@ -0,0 +1,501 @@ +// Copyright 2019 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Based on CRYPTOGAMS code with the following comment: +// # ==================================================================== +// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +// # project. The module is, however, dual licensed under OpenSSL and +// # CRYPTOGAMS licenses depending on where you obtain it. For further +// # details see http://www.openssl.org/~appro/cryptogams/. +// # ==================================================================== + +// Code for the perl script that generates the ppc64 assembler +// can be found in the cryptogams repository at the link below. It is based on +// the original from openssl. + +// https://github.com/dot-asm/cryptogams/commit/a60f5b50ed908e91 + +// The differences in this and the original implementation are +// due to the calling conventions and initialization of constants. + +//go:build gc && !purego && (ppc64 || ppc64le) + +#include "textflag.h" + +#define OUT R3 +#define INP R4 +#define LEN R5 +#define KEY R6 +#define CNT R7 +#define TMP R15 + +#define CONSTBASE R16 +#define BLOCKS R17 + +// for VPERMXOR +#define MASK R18 + +DATA consts<>+0x00(SB)/4, $0x61707865 +DATA consts<>+0x04(SB)/4, $0x3320646e +DATA consts<>+0x08(SB)/4, $0x79622d32 +DATA consts<>+0x0c(SB)/4, $0x6b206574 +DATA consts<>+0x10(SB)/4, $0x00000001 +DATA consts<>+0x14(SB)/4, $0x00000000 +DATA consts<>+0x18(SB)/4, $0x00000000 +DATA consts<>+0x1c(SB)/4, $0x00000000 +DATA consts<>+0x20(SB)/4, $0x00000004 +DATA consts<>+0x24(SB)/4, $0x00000000 +DATA consts<>+0x28(SB)/4, $0x00000000 +DATA consts<>+0x2c(SB)/4, $0x00000000 +DATA consts<>+0x30(SB)/4, $0x0e0f0c0d +DATA consts<>+0x34(SB)/4, $0x0a0b0809 +DATA consts<>+0x38(SB)/4, $0x06070405 +DATA consts<>+0x3c(SB)/4, $0x02030001 +DATA consts<>+0x40(SB)/4, $0x0d0e0f0c +DATA consts<>+0x44(SB)/4, $0x090a0b08 +DATA consts<>+0x48(SB)/4, $0x05060704 +DATA consts<>+0x4c(SB)/4, $0x01020300 +DATA consts<>+0x50(SB)/4, $0x61707865 +DATA consts<>+0x54(SB)/4, $0x61707865 +DATA consts<>+0x58(SB)/4, $0x61707865 +DATA consts<>+0x5c(SB)/4, $0x61707865 +DATA consts<>+0x60(SB)/4, $0x3320646e +DATA consts<>+0x64(SB)/4, $0x3320646e +DATA consts<>+0x68(SB)/4, $0x3320646e +DATA consts<>+0x6c(SB)/4, $0x3320646e +DATA consts<>+0x70(SB)/4, $0x79622d32 +DATA consts<>+0x74(SB)/4, $0x79622d32 +DATA consts<>+0x78(SB)/4, $0x79622d32 +DATA consts<>+0x7c(SB)/4, $0x79622d32 +DATA consts<>+0x80(SB)/4, $0x6b206574 +DATA consts<>+0x84(SB)/4, $0x6b206574 +DATA consts<>+0x88(SB)/4, $0x6b206574 +DATA consts<>+0x8c(SB)/4, $0x6b206574 +DATA consts<>+0x90(SB)/4, $0x00000000 +DATA consts<>+0x94(SB)/4, $0x00000001 +DATA consts<>+0x98(SB)/4, $0x00000002 +DATA consts<>+0x9c(SB)/4, $0x00000003 +DATA consts<>+0xa0(SB)/4, $0x11223300 +DATA consts<>+0xa4(SB)/4, $0x55667744 +DATA consts<>+0xa8(SB)/4, $0x99aabb88 +DATA consts<>+0xac(SB)/4, $0xddeeffcc +DATA consts<>+0xb0(SB)/4, $0x22330011 +DATA consts<>+0xb4(SB)/4, $0x66774455 +DATA consts<>+0xb8(SB)/4, $0xaabb8899 +DATA consts<>+0xbc(SB)/4, $0xeeffccdd +GLOBL consts<>(SB), RODATA, $0xc0 + +#ifdef GOARCH_ppc64 +#define BE_XXBRW_INIT() \ + LVSL (R0)(R0), V24 \ + VSPLTISB $3, V25 \ + VXOR V24, V25, V24 \ + +#define BE_XXBRW(vr) VPERM vr, vr, V24, vr +#else +#define BE_XXBRW_INIT() +#define BE_XXBRW(vr) +#endif + +//func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32) +TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40 + MOVD out+0(FP), OUT + MOVD inp+8(FP), INP + MOVD len+16(FP), LEN + MOVD key+24(FP), KEY + MOVD counter+32(FP), CNT + + // Addressing for constants + MOVD $consts<>+0x00(SB), CONSTBASE + MOVD $16, R8 + MOVD $32, R9 + MOVD $48, R10 + MOVD $64, R11 + SRD $6, LEN, BLOCKS + // for VPERMXOR + MOVD $consts<>+0xa0(SB), MASK + MOVD $16, R20 + // V16 + LXVW4X (CONSTBASE)(R0), VS48 + ADD $80,CONSTBASE + + // Load key into V17,V18 + LXVW4X (KEY)(R0), VS49 + LXVW4X (KEY)(R8), VS50 + + // Load CNT, NONCE into V19 + LXVW4X (CNT)(R0), VS51 + + // Clear V27 + VXOR V27, V27, V27 + + BE_XXBRW_INIT() + + // V28 + LXVW4X (CONSTBASE)(R11), VS60 + + // Load mask constants for VPERMXOR + LXVW4X (MASK)(R0), V20 + LXVW4X (MASK)(R20), V21 + + // splat slot from V19 -> V26 + VSPLTW $0, V19, V26 + + VSLDOI $4, V19, V27, V19 + VSLDOI $12, V27, V19, V19 + + VADDUWM V26, V28, V26 + + MOVD $10, R14 + MOVD R14, CTR + PCALIGN $16 +loop_outer_vsx: + // V0, V1, V2, V3 + LXVW4X (R0)(CONSTBASE), VS32 + LXVW4X (R8)(CONSTBASE), VS33 + LXVW4X (R9)(CONSTBASE), VS34 + LXVW4X (R10)(CONSTBASE), VS35 + + // splat values from V17, V18 into V4-V11 + VSPLTW $0, V17, V4 + VSPLTW $1, V17, V5 + VSPLTW $2, V17, V6 + VSPLTW $3, V17, V7 + VSPLTW $0, V18, V8 + VSPLTW $1, V18, V9 + VSPLTW $2, V18, V10 + VSPLTW $3, V18, V11 + + // VOR + VOR V26, V26, V12 + + // splat values from V19 -> V13, V14, V15 + VSPLTW $1, V19, V13 + VSPLTW $2, V19, V14 + VSPLTW $3, V19, V15 + + // splat const values + VSPLTISW $-16, V27 + VSPLTISW $12, V28 + VSPLTISW $8, V29 + VSPLTISW $7, V30 + PCALIGN $16 +loop_vsx: + VADDUWM V0, V4, V0 + VADDUWM V1, V5, V1 + VADDUWM V2, V6, V2 + VADDUWM V3, V7, V3 + + VPERMXOR V12, V0, V21, V12 + VPERMXOR V13, V1, V21, V13 + VPERMXOR V14, V2, V21, V14 + VPERMXOR V15, V3, V21, V15 + + VADDUWM V8, V12, V8 + VADDUWM V9, V13, V9 + VADDUWM V10, V14, V10 + VADDUWM V11, V15, V11 + + VXOR V4, V8, V4 + VXOR V5, V9, V5 + VXOR V6, V10, V6 + VXOR V7, V11, V7 + + VRLW V4, V28, V4 + VRLW V5, V28, V5 + VRLW V6, V28, V6 + VRLW V7, V28, V7 + + VADDUWM V0, V4, V0 + VADDUWM V1, V5, V1 + VADDUWM V2, V6, V2 + VADDUWM V3, V7, V3 + + VPERMXOR V12, V0, V20, V12 + VPERMXOR V13, V1, V20, V13 + VPERMXOR V14, V2, V20, V14 + VPERMXOR V15, V3, V20, V15 + + VADDUWM V8, V12, V8 + VADDUWM V9, V13, V9 + VADDUWM V10, V14, V10 + VADDUWM V11, V15, V11 + + VXOR V4, V8, V4 + VXOR V5, V9, V5 + VXOR V6, V10, V6 + VXOR V7, V11, V7 + + VRLW V4, V30, V4 + VRLW V5, V30, V5 + VRLW V6, V30, V6 + VRLW V7, V30, V7 + + VADDUWM V0, V5, V0 + VADDUWM V1, V6, V1 + VADDUWM V2, V7, V2 + VADDUWM V3, V4, V3 + + VPERMXOR V15, V0, V21, V15 + VPERMXOR V12, V1, V21, V12 + VPERMXOR V13, V2, V21, V13 + VPERMXOR V14, V3, V21, V14 + + VADDUWM V10, V15, V10 + VADDUWM V11, V12, V11 + VADDUWM V8, V13, V8 + VADDUWM V9, V14, V9 + + VXOR V5, V10, V5 + VXOR V6, V11, V6 + VXOR V7, V8, V7 + VXOR V4, V9, V4 + + VRLW V5, V28, V5 + VRLW V6, V28, V6 + VRLW V7, V28, V7 + VRLW V4, V28, V4 + + VADDUWM V0, V5, V0 + VADDUWM V1, V6, V1 + VADDUWM V2, V7, V2 + VADDUWM V3, V4, V3 + + VPERMXOR V15, V0, V20, V15 + VPERMXOR V12, V1, V20, V12 + VPERMXOR V13, V2, V20, V13 + VPERMXOR V14, V3, V20, V14 + + VADDUWM V10, V15, V10 + VADDUWM V11, V12, V11 + VADDUWM V8, V13, V8 + VADDUWM V9, V14, V9 + + VXOR V5, V10, V5 + VXOR V6, V11, V6 + VXOR V7, V8, V7 + VXOR V4, V9, V4 + + VRLW V5, V30, V5 + VRLW V6, V30, V6 + VRLW V7, V30, V7 + VRLW V4, V30, V4 + BDNZ loop_vsx + + VADDUWM V12, V26, V12 + + VMRGEW V0, V1, V27 + VMRGEW V2, V3, V28 + + VMRGOW V0, V1, V0 + VMRGOW V2, V3, V2 + + VMRGEW V4, V5, V29 + VMRGEW V6, V7, V30 + + XXPERMDI VS32, VS34, $0, VS33 + XXPERMDI VS32, VS34, $3, VS35 + XXPERMDI VS59, VS60, $0, VS32 + XXPERMDI VS59, VS60, $3, VS34 + + VMRGOW V4, V5, V4 + VMRGOW V6, V7, V6 + + VMRGEW V8, V9, V27 + VMRGEW V10, V11, V28 + + XXPERMDI VS36, VS38, $0, VS37 + XXPERMDI VS36, VS38, $3, VS39 + XXPERMDI VS61, VS62, $0, VS36 + XXPERMDI VS61, VS62, $3, VS38 + + VMRGOW V8, V9, V8 + VMRGOW V10, V11, V10 + + VMRGEW V12, V13, V29 + VMRGEW V14, V15, V30 + + XXPERMDI VS40, VS42, $0, VS41 + XXPERMDI VS40, VS42, $3, VS43 + XXPERMDI VS59, VS60, $0, VS40 + XXPERMDI VS59, VS60, $3, VS42 + + VMRGOW V12, V13, V12 + VMRGOW V14, V15, V14 + + VSPLTISW $4, V27 + VADDUWM V26, V27, V26 + + XXPERMDI VS44, VS46, $0, VS45 + XXPERMDI VS44, VS46, $3, VS47 + XXPERMDI VS61, VS62, $0, VS44 + XXPERMDI VS61, VS62, $3, VS46 + + VADDUWM V0, V16, V0 + VADDUWM V4, V17, V4 + VADDUWM V8, V18, V8 + VADDUWM V12, V19, V12 + + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + + CMPU LEN, $64 + BLT tail_vsx + + // Bottom of loop + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V1, V16, V0 + VADDUWM V5, V17, V4 + VADDUWM V9, V18, V8 + VADDUWM V13, V19, V12 + + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(V10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V2, V16, V0 + VADDUWM V6, V17, V4 + VADDUWM V10, V18, V8 + VADDUWM V14, V19, V12 + + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + BEQ done_vsx + + VADDUWM V3, V16, V0 + VADDUWM V7, V17, V4 + VADDUWM V11, V18, V8 + VADDUWM V15, V19, V12 + + BE_XXBRW(V0) + BE_XXBRW(V4) + BE_XXBRW(V8) + BE_XXBRW(V12) + + CMPU LEN, $64 + BLT tail_vsx + + LXVW4X (INP)(R0), VS59 + LXVW4X (INP)(R8), VS60 + LXVW4X (INP)(R9), VS61 + LXVW4X (INP)(R10), VS62 + + VXOR V27, V0, V27 + VXOR V28, V4, V28 + VXOR V29, V8, V29 + VXOR V30, V12, V30 + + STXVW4X VS59, (OUT)(R0) + STXVW4X VS60, (OUT)(R8) + ADD $64, INP + STXVW4X VS61, (OUT)(R9) + ADD $-64, LEN + STXVW4X VS62, (OUT)(R10) + ADD $64, OUT + + MOVD $10, R14 + MOVD R14, CTR + BNE loop_outer_vsx + +done_vsx: + // Increment counter by number of 64 byte blocks + MOVWZ (CNT), R14 + ADD BLOCKS, R14 + MOVWZ R14, (CNT) + RET + +tail_vsx: + ADD $32, R1, R11 + MOVD LEN, CTR + + // Save values on stack to copy from + STXVW4X VS32, (R11)(R0) + STXVW4X VS36, (R11)(R8) + STXVW4X VS40, (R11)(R9) + STXVW4X VS44, (R11)(R10) + ADD $-1, R11, R12 + ADD $-1, INP + ADD $-1, OUT + PCALIGN $16 +looptail_vsx: + // Copying the result to OUT + // in bytes. + MOVBZU 1(R12), KEY + MOVBZU 1(INP), TMP + XOR KEY, TMP, KEY + MOVBU KEY, 1(OUT) + BDNZ looptail_vsx + + // Clear the stack values + STXVW4X VS48, (R11)(R0) + STXVW4X VS48, (R11)(R8) + STXVW4X VS48, (R11)(R9) + STXVW4X VS48, (R11)(R10) + BR done_vsx |