diff options
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- | arch/x86/crypto/Makefile | 4 | ||||
-rw-r--r-- | arch/x86/crypto/aesni-intel_avx-x86_64.S | 2811 | ||||
-rw-r--r-- | arch/x86/crypto/aesni-intel_glue.c | 147 | ||||
-rw-r--r-- | arch/x86/crypto/blowfish_glue.c | 3 | ||||
-rw-r--r-- | arch/x86/crypto/cast5_avx_glue.c | 3 | ||||
-rw-r--r-- | arch/x86/crypto/ghash-clmulni-intel_asm.S | 29 | ||||
-rw-r--r-- | arch/x86/crypto/ghash-clmulni-intel_glue.c | 14 | ||||
-rw-r--r-- | arch/x86/crypto/sha1_avx2_x86_64_asm.S | 708 | ||||
-rw-r--r-- | arch/x86/crypto/sha1_ssse3_glue.c | 53 |
9 files changed, 3725 insertions, 47 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index e0fc24db234a..61d6e281898b 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -76,8 +76,12 @@ ifeq ($(avx2_supported),yes) endif aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o +aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o +ifeq ($(avx2_supported),yes) +sha1-ssse3-y += sha1_avx2_x86_64_asm.o +endif crc32c-intel-y := crc32c-intel_glue.o crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S new file mode 100644 index 000000000000..522ab68d1c88 --- /dev/null +++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S @@ -0,0 +1,2811 @@ +######################################################################## +# Copyright (c) 2013, Intel Corporation +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR +# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +## +## Authors: +## Erdinc Ozturk <erdinc.ozturk@intel.com> +## Vinodh Gopal <vinodh.gopal@intel.com> +## James Guilford <james.guilford@intel.com> +## Tim Chen <tim.c.chen@linux.intel.com> +## +## References: +## This code was derived and highly optimized from the code described in paper: +## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation +## on Intel Architecture Processors. August, 2010 +## The details of the implementation is explained in: +## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode +## on Intel Architecture Processors. October, 2012. +## +## Assumptions: +## +## +## +## iv: +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | Salt (From the SA) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | Initialization Vector | +## | (This is the sequence number from IPSec header) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x1 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## +## +## AAD: +## AAD padded to 128 bits with 0 +## for example, assume AAD is a u32 vector +## +## if AAD is 8 bytes: +## AAD[3] = {A0, A1}# +## padded AAD in xmm register = {A1 A0 0 0} +## +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | SPI (A1) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 32-bit Sequence Number (A0) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x0 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## AAD Format with 32-bit Sequence Number +## +## if AAD is 12 bytes: +## AAD[3] = {A0, A1, A2}# +## padded AAD in xmm register = {A2 A1 A0 0} +## +## 0 1 2 3 +## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | SPI (A2) | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 64-bit Extended Sequence Number {A1,A0} | +## | | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## | 0x0 | +## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +## +## AAD Format with 64-bit Extended Sequence Number +## +## +## aadLen: +## from the definition of the spec, aadLen can only be 8 or 12 bytes. +## The code additionally supports aadLen of length 16 bytes. +## +## TLen: +## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +## +## poly = x^128 + x^127 + x^126 + x^121 + 1 +## throughout the code, one tab and two tab indentations are used. one tab is +## for GHASH part, two tabs is for AES part. +## + +#include <linux/linkage.h> +#include <asm/inst.h> + +.data +.align 16 + +POLY: .octa 0xC2000000000000000000000000000001 +POLY2: .octa 0xC20000000000000000000001C2000000 +TWOONE: .octa 0x00000001000000000000000000000001 + +# order of these constants should not change. +# more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F + +SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff +ZERO: .octa 0x00000000000000000000000000000000 +ONE: .octa 0x00000000000000000000000000000001 +ONEf: .octa 0x01000000000000000000000000000000 + +.text + + +##define the fields of the gcm aes context +#{ +# u8 expanded_keys[16*11] store expanded keys +# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here +# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here +# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here +# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here +# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here +# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here +# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here +# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here +# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes) +# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes) +#} gcm_ctx# + +HashKey = 16*11 # store HashKey <<1 mod poly here +HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here +HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here +HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here +HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here +HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here +HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here +HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here +HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) +HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx +#define arg4 %rcx +#define arg5 %r8 +#define arg6 %r9 +#define arg7 STACK_OFFSET+8*1(%r14) +#define arg8 STACK_OFFSET+8*2(%r14) +#define arg9 STACK_OFFSET+8*3(%r14) + +i = 0 +j = 0 + +out_order = 0 +in_order = 1 +DEC = 0 +ENC = 1 + +.macro define_reg r n +reg_\r = %xmm\n +.endm + +.macro setreg +.altmacro +define_reg i %i +define_reg j %j +.noaltmacro +.endm + +# need to push 4 registers into stack to maintain +STACK_OFFSET = 8*4 + +TMP1 = 16*0 # Temporary storage for AAD +TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) +TMP3 = 16*2 # Temporary storage for AES State 3 +TMP4 = 16*3 # Temporary storage for AES State 4 +TMP5 = 16*4 # Temporary storage for AES State 5 +TMP6 = 16*5 # Temporary storage for AES State 6 +TMP7 = 16*6 # Temporary storage for AES State 7 +TMP8 = 16*7 # Temporary storage for AES State 8 + +VARIABLE_OFFSET = 16*8 + +################################ +# Utility Macros +################################ + +# Encryption of a single block +.macro ENCRYPT_SINGLE_BLOCK XMM0 + vpxor (arg1), \XMM0, \XMM0 + i = 1 + setreg +.rep 9 + vaesenc 16*i(arg1), \XMM0, \XMM0 + i = (i+1) + setreg +.endr + vaesenclast 16*10(arg1), \XMM0, \XMM0 +.endm + +#ifdef CONFIG_AS_AVX +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. >>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 + + vpshufd $0b01001110, \GH, \T2 + vpshufd $0b01001110, \HK, \T3 + vpxor \GH , \T2, \T2 # T2 = (a1+a0) + vpxor \HK , \T3, \T3 # T3 = (b1+b0) + + vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 + vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 + vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) + vpxor \GH, \T2,\T2 + vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 + + vpslldq $8, \T2,\T3 # shift-L T3 2 DWs + vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs + vpxor \T3, \GH, \GH + vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK + + #first phase of the reduction + vpslld $31, \GH, \T2 # packed right shifting << 31 + vpslld $30, \GH, \T3 # packed right shifting shift << 30 + vpslld $25, \GH, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T5 # shift-R T5 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \GH, \GH # first phase of the reduction complete + + #second phase of the reduction + + vpsrld $1,\GH, \T2 # packed left shifting >> 1 + vpsrld $2,\GH, \T3 # packed left shifting >> 2 + vpsrld $7,\GH, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T5, \T2, \T2 + vpxor \T2, \GH, \GH + vpxor \T1, \GH, \GH # the result is in GH + + +.endm + +.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 + + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 + + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_2_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqa \T5, HashKey_3(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_3_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqa \T5, HashKey_4(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_4_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqa \T5, HashKey_5(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_5_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqa \T5, HashKey_6(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_6_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqa \T5, HashKey_7(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_7_k(arg1) + + GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqa \T5, HashKey_8(arg1) + vpshufd $0b01001110, \T5, \T1 + vpxor \T5, \T1, \T1 + vmovdqa \T1, HashKey_8_k(arg1) + +.endm + +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg2, arg3, r14 are used as a pointer only, not modified + +.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC + i = (8-\num_initial_blocks) + setreg + + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_i, reg_i, reg_i +_get_AAD_loop\@: + vmovd (%r10), \T1 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i + + add $4, %r10 + sub $4, %r12 + jg _get_AAD_loop\@ + + + cmp $16, %r11 + je _get_AAD_loop2_done\@ + mov $16, %r12 + +_get_AAD_loop2\@: + vpsrldq $4, reg_i, reg_i + sub $4, %r12 + cmp %r11, %r12 + jg _get_AAD_loop2\@ + +_get_AAD_loop2_done\@: + + #byte-reflect the AAD data + vpshufb SHUF_MASK(%rip), reg_i, reg_i + + # initialize the data pointer offset as zero + xor %r11, %r11 + + # start AES for num_initial_blocks blocks + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), \CTR # CTR = Y0 + vpshufb SHUF_MASK(%rip), \CTR, \CTR + + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr + + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = 1 + setreg +.rep 9 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = (j+1) + setreg +.endr + + + vmovdqa 16*10(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg3, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr + + + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg + GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here + + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, \T3 + + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants + +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 + + i = 1 + setreg +.rep 9 # do 9 rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr + + + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 + + vmovdqu (arg3, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + vmovdqu \XMM1, (arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif + + vmovdqu 16*1(arg3, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif + + vmovdqu 16*2(arg3, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif + + vmovdqu 16*3(arg3, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif + + vmovdqu 16*4(arg3, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif + + vmovdqu 16*5(arg3, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif + + vmovdqu 16*6(arg3, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif + + vmovdqu 16*7(arg3, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif + + add $128, %r11 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + +############################################################################### + +_initial_blocks_done\@: + +.endm + +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg2, arg3 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC + + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) + +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif + + + ####################################################################### + + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 + + ####################################################################### + + + + + + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + ####################################################################### + + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + + vpshufd $0b01001110, \T2, \T6 + vpxor \T2, \T6, \T6 + + vmovdqa HashKey_8_k(arg1), \T5 + vpclmulqdq $0x00, \T5, \T6, \T6 + + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP2(%rsp), \T1 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_7_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + ####################################################################### + + vmovdqa TMP3(%rsp), \T1 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_6_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP4(%rsp), \T1 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_5_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + vmovdqa TMP5(%rsp), \T1 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_4_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP6(%rsp), \T1 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_3_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP7(%rsp), \T1 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_2_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + ####################################################################### + + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqa TMP8(%rsp), \T1 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpshufd $0b01001110, \T1, \T3 + vpxor \T1, \T3, \T3 + vmovdqa HashKey_k(arg1), \T5 + vpclmulqdq $0x10, \T5, \T3, \T3 + vpxor \T3, \T6, \T6 + + vpxor \T4, \T6, \T6 + vpxor \T7, \T6, \T6 + + vmovdqu 16*10(arg1), \T5 + + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg3, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg3, %r11), reg_j + vmovdqu \T3, 16*i(arg2, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### + + + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 + + + + ####################################################################### + #first phase of the reduction + ####################################################################### + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + .endif + + ####################################################################### + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + ####################################################################### + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + + vpxor \T6, \XMM1, \XMM1 + + + +.endm + + +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + + ## Karatsuba Method + + + vpshufd $0b01001110, \XMM1, \T2 + vpxor \XMM1, \T2, \T2 + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 + + vmovdqa HashKey_8_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM2, \T2 + vpxor \XMM2, \T2, \T2 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_7_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM3, \T2 + vpxor \XMM3, \T2, \T2 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_6_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM4, \T2 + vpxor \XMM4, \T2, \T2 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_5_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM5, \T2 + vpxor \XMM5, \T2, \T2 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_4_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM6, \T2 + vpxor \XMM6, \T2, \T2 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_3_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM7, \T2 + vpxor \XMM7, \T2, \T2 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_2_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vpshufd $0b01001110, \XMM8, \T2 + vpxor \XMM8, \T2, \T2 + vmovdqa HashKey(arg1), \T5 + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 + + vmovdqa HashKey_k(arg1), \T3 + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 + + + + + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # <T6:T7> holds the result of + # the accumulated carry-less multiplications + + ####################################################################### + #first phase of the reduction + vpslld $31, \T7, \T2 # packed right shifting << 31 + vpslld $30, \T7, \T3 # packed right shifting shift << 30 + vpslld $25, \T7, \T4 # packed right shifting shift << 25 + + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpsrldq $4, \T2, \T1 # shift-R T1 1 DW + + vpslldq $12, \T2, \T2 # shift-L T2 3 DWs + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + + + #second phase of the reduction + vpsrld $1, \T7, \T2 # packed left shifting >> 1 + vpsrld $2, \T7, \T3 # packed left shifting >> 2 + vpsrld $7, \T7, \T4 # packed left shifting >> 7 + vpxor \T3, \T2, \T2 # xor the shifted versions + vpxor \T4, \T2, \T2 + + vpxor \T1, \T2, \T2 + vpxor \T2, \T7, \T7 + vpxor \T7, \T6, \T6 # the result is in T6 + +.endm + + +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC_AVX ENC_DEC + + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + + vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + + mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) + + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ + + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ + + jmp _initial_num_blocks_is_1\@ + +_initial_num_blocks_is_7\@: + INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_6\@: + INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_5\@: + INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_4\@: + INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_3\@: + INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_2\@: + INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_1\@: + INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_0\@: + INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + + +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ + + sub $128, %r13 + je _eight_cipher_left\@ + + + + + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ + + + + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ + +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + + + +_eight_cipher_left\@: + GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + + +_zero_cipher_left\@: + cmp $16, arg4 + jl _only_less_than_16\@ + + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + sub $16, %r11 + add %r13, %r11 + vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes + jmp _final_ghash_mul\@ + +_only_less_than_16\@: + # check for 0 length + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + +_get_last_16_byte_loop\@: + movb (arg3, %r11), %al + movb %al, TMP1 (%rsp , %r11) + add $1, %r11 + cmp %r13, %r11 + jne _get_last_16_byte_loop\@ + + vmovdqu TMP1(%rsp), %xmm1 + + sub $16, %r11 + +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to + # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif + + + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ + + mov %rax, (arg2 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 + +_less_than_8_bytes_left\@: + movb %al, (arg2 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# + +_multiple_of_16_bytes\@: + mov arg7, %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 + + shl $3, arg4 # len(C) in bits (*128) + vmovq arg4, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + + vpxor %xmm15, %xmm14, %xmm14 + GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), %xmm9 # xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + + vpxor %xmm14, %xmm9, %xmm9 + + + +_return_T\@: + mov arg8, %r10 # r10 = authTag + mov arg9, %r11 # r11 = auth_tag_len + + cmp $16, %r11 + je _T_16\@ + + cmp $12, %r11 + je _T_12\@ + +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + jmp _return_T_done\@ +_T_12\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + vpsrldq $8, %xmm9, %xmm9 + vmovd %xmm9, %eax + mov %eax, 8(%r10) + jmp _return_T_done\@ + +_T_16\@: + vmovdqu %xmm9, (%r10) + +_return_T_done\@: + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + + +############################################################# +#void aesni_gcm_precomp_avx_gen2 +# (gcm_data *my_ctx_data, +# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +############################################################# +ENTRY(aesni_gcm_precomp_avx_gen2) + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + vmovdqu (arg2), %xmm6 # xmm6 = HashKey + + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + ####################################################################### + vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + + + PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + ret +ENDPROC(aesni_gcm_precomp_avx_gen2) + +############################################################################### +#void aesni_gcm_enc_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +# const u8 *in, /* Plaintext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_enc_avx_gen2) + GCM_ENC_DEC_AVX ENC + ret +ENDPROC(aesni_gcm_enc_avx_gen2) + +############################################################################### +#void aesni_gcm_dec_avx_gen2( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ +# const u8 *in, /* Ciphertext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_dec_avx_gen2) + GCM_ENC_DEC_AVX DEC + ret +ENDPROC(aesni_gcm_dec_avx_gen2) +#endif /* CONFIG_AS_AVX */ + +#ifdef CONFIG_AS_AVX2 +############################################################################### +# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +# Input: A and B (128-bits each, bit-reflected) +# Output: C = A*B*x mod poly, (i.e. >>1 ) +# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +############################################################################### +.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 + + vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 + vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 + vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 + vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 + vpxor \T3, \GH, \GH + + + vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs + vpslldq $8 , \GH, \GH # shift-L GH 2 DWs + + vpxor \T3, \T1, \T1 + vpxor \T2, \GH, \GH + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \GH, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L T2 2 DWs + + vpxor \T2, \GH, \GH # first phase of the reduction complete + ####################################################################### + #second phase of the reduction + vpclmulqdq $0x00, \GH, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \GH, \T3, \GH + vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \GH, \GH # second phase of the reduction complete + ####################################################################### + vpxor \T1, \GH, \GH # the result is in GH + + +.endm + +.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 + + # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vmovdqa \HK, \T5 + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly + vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly + vmovdqa \T5, HashKey_3(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly + vmovdqa \T5, HashKey_4(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly + vmovdqa \T5, HashKey_5(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly + vmovdqa \T5, HashKey_6(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly + vmovdqa \T5, HashKey_7(arg1) + + GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly + vmovdqa \T5, HashKey_8(arg1) + +.endm + + +## if a = number of total plaintext bytes +## b = floor(a/16) +## num_initial_blocks = b mod 4# +## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +## r10, r11, r12, rax are clobbered +## arg1, arg2, arg3, r14 are used as a pointer only, not modified + +.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER + i = (8-\num_initial_blocks) + setreg + + mov arg6, %r10 # r10 = AAD + mov arg7, %r12 # r12 = aadLen + + + mov %r12, %r11 + + vpxor reg_i, reg_i, reg_i +_get_AAD_loop\@: + vmovd (%r10), \T1 + vpslldq $12, \T1, \T1 + vpsrldq $4, reg_i, reg_i + vpxor \T1, reg_i, reg_i + + add $4, %r10 + sub $4, %r12 + jg _get_AAD_loop\@ + + + cmp $16, %r11 + je _get_AAD_loop2_done\@ + mov $16, %r12 + +_get_AAD_loop2\@: + vpsrldq $4, reg_i, reg_i + sub $4, %r12 + cmp %r11, %r12 + jg _get_AAD_loop2\@ + +_get_AAD_loop2_done\@: + + #byte-reflect the AAD data + vpshufb SHUF_MASK(%rip), reg_i, reg_i + + # initialize the data pointer offset as zero + xor %r11, %r11 + + # start AES for num_initial_blocks blocks + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), \CTR # CTR = Y0 + vpshufb SHUF_MASK(%rip), \CTR, \CTR + + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, reg_i + vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap + i = (i+1) + setreg +.endr + + vmovdqa (arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vpxor \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = 1 + setreg +.rep 9 + vmovdqa 16*j(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenc \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + j = (j+1) + setreg +.endr + + + vmovdqa 16*10(arg1), \T_key + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vaesenclast \T_key, reg_i, reg_i + i = (i+1) + setreg +.endr + + i = (9-\num_initial_blocks) + setreg +.rep \num_initial_blocks + vmovdqu (arg3, %r11), \T1 + vpxor \T1, reg_i, reg_i + vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for + # num_initial_blocks blocks + add $16, %r11 +.if \ENC_DEC == DEC + vmovdqa \T1, reg_i +.endif + vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations + i = (i+1) + setreg +.endr + + + i = (8-\num_initial_blocks) + j = (9-\num_initial_blocks) + setreg + GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6 + +.rep \num_initial_blocks + vpxor reg_i, reg_j, reg_j + GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks + i = (i+1) + j = (j+1) + setreg +.endr + # XMM8 has the combined result here + + vmovdqa \XMM8, TMP1(%rsp) + vmovdqa \XMM8, \T3 + + cmp $128, %r13 + jl _initial_blocks_done\@ # no need for precomputed constants + +############################################################################### +# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM1 + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM2 + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM3 + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM4 + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM5 + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM6 + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM7 + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + + vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 + vmovdqa \CTR, \XMM8 + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + vmovdqa (arg1), \T_key + vpxor \T_key, \XMM1, \XMM1 + vpxor \T_key, \XMM2, \XMM2 + vpxor \T_key, \XMM3, \XMM3 + vpxor \T_key, \XMM4, \XMM4 + vpxor \T_key, \XMM5, \XMM5 + vpxor \T_key, \XMM6, \XMM6 + vpxor \T_key, \XMM7, \XMM7 + vpxor \T_key, \XMM8, \XMM8 + + i = 1 + setreg +.rep 9 # do 9 rounds + vmovdqa 16*i(arg1), \T_key + vaesenc \T_key, \XMM1, \XMM1 + vaesenc \T_key, \XMM2, \XMM2 + vaesenc \T_key, \XMM3, \XMM3 + vaesenc \T_key, \XMM4, \XMM4 + vaesenc \T_key, \XMM5, \XMM5 + vaesenc \T_key, \XMM6, \XMM6 + vaesenc \T_key, \XMM7, \XMM7 + vaesenc \T_key, \XMM8, \XMM8 + i = (i+1) + setreg +.endr + + + vmovdqa 16*i(arg1), \T_key + vaesenclast \T_key, \XMM1, \XMM1 + vaesenclast \T_key, \XMM2, \XMM2 + vaesenclast \T_key, \XMM3, \XMM3 + vaesenclast \T_key, \XMM4, \XMM4 + vaesenclast \T_key, \XMM5, \XMM5 + vaesenclast \T_key, \XMM6, \XMM6 + vaesenclast \T_key, \XMM7, \XMM7 + vaesenclast \T_key, \XMM8, \XMM8 + + vmovdqu (arg3, %r11), \T1 + vpxor \T1, \XMM1, \XMM1 + vmovdqu \XMM1, (arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM1 + .endif + + vmovdqu 16*1(arg3, %r11), \T1 + vpxor \T1, \XMM2, \XMM2 + vmovdqu \XMM2, 16*1(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM2 + .endif + + vmovdqu 16*2(arg3, %r11), \T1 + vpxor \T1, \XMM3, \XMM3 + vmovdqu \XMM3, 16*2(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM3 + .endif + + vmovdqu 16*3(arg3, %r11), \T1 + vpxor \T1, \XMM4, \XMM4 + vmovdqu \XMM4, 16*3(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM4 + .endif + + vmovdqu 16*4(arg3, %r11), \T1 + vpxor \T1, \XMM5, \XMM5 + vmovdqu \XMM5, 16*4(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM5 + .endif + + vmovdqu 16*5(arg3, %r11), \T1 + vpxor \T1, \XMM6, \XMM6 + vmovdqu \XMM6, 16*5(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM6 + .endif + + vmovdqu 16*6(arg3, %r11), \T1 + vpxor \T1, \XMM7, \XMM7 + vmovdqu \XMM7, 16*6(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM7 + .endif + + vmovdqu 16*7(arg3, %r11), \T1 + vpxor \T1, \XMM8, \XMM8 + vmovdqu \XMM8, 16*7(arg2 , %r11) + .if \ENC_DEC == DEC + vmovdqa \T1, \XMM8 + .endif + + add $128, %r11 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with + # the corresponding ciphertext + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + +############################################################################### + +_initial_blocks_done\@: + + +.endm + + + +# encrypt 8 blocks at a time +# ghash the 8 previously encrypted ciphertext blocks +# arg1, arg2, arg3 are used as pointers only, not modified +# r11 is the data offset value +.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC + + vmovdqa \XMM1, \T2 + vmovdqa \XMM2, TMP2(%rsp) + vmovdqa \XMM3, TMP3(%rsp) + vmovdqa \XMM4, TMP4(%rsp) + vmovdqa \XMM5, TMP5(%rsp) + vmovdqa \XMM6, TMP6(%rsp) + vmovdqa \XMM7, TMP7(%rsp) + vmovdqa \XMM8, TMP8(%rsp) + +.if \loop_idx == in_order + vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONE(%rip), \XMM1, \XMM2 + vpaddd ONE(%rip), \XMM2, \XMM3 + vpaddd ONE(%rip), \XMM3, \XMM4 + vpaddd ONE(%rip), \XMM4, \XMM5 + vpaddd ONE(%rip), \XMM5, \XMM6 + vpaddd ONE(%rip), \XMM6, \XMM7 + vpaddd ONE(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +.else + vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT + vpaddd ONEf(%rip), \XMM1, \XMM2 + vpaddd ONEf(%rip), \XMM2, \XMM3 + vpaddd ONEf(%rip), \XMM3, \XMM4 + vpaddd ONEf(%rip), \XMM4, \XMM5 + vpaddd ONEf(%rip), \XMM5, \XMM6 + vpaddd ONEf(%rip), \XMM6, \XMM7 + vpaddd ONEf(%rip), \XMM7, \XMM8 + vmovdqa \XMM8, \CTR +.endif + + + ####################################################################### + + vmovdqu (arg1), \T1 + vpxor \T1, \XMM1, \XMM1 + vpxor \T1, \XMM2, \XMM2 + vpxor \T1, \XMM3, \XMM3 + vpxor \T1, \XMM4, \XMM4 + vpxor \T1, \XMM5, \XMM5 + vpxor \T1, \XMM6, \XMM6 + vpxor \T1, \XMM7, \XMM7 + vpxor \T1, \XMM8, \XMM8 + + ####################################################################### + + + + + + vmovdqu 16*1(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqu 16*2(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + ####################################################################### + + vmovdqa HashKey_8(arg1), \T5 + vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 + vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 + vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 + vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 + vpxor \T5, \T6, \T6 + + vmovdqu 16*3(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP2(%rsp), \T1 + vmovdqa HashKey_7(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*4(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + ####################################################################### + + vmovdqa TMP3(%rsp), \T1 + vmovdqa HashKey_6(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*5(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP4(%rsp), \T1 + vmovdqa HashKey_5(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*6(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + + vmovdqa TMP5(%rsp), \T1 + vmovdqa HashKey_4(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*7(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP6(%rsp), \T1 + vmovdqa HashKey_3(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vmovdqu 16*8(arg1), \T1 + vaesenc \T1, \XMM1, \XMM1 + vaesenc \T1, \XMM2, \XMM2 + vaesenc \T1, \XMM3, \XMM3 + vaesenc \T1, \XMM4, \XMM4 + vaesenc \T1, \XMM5, \XMM5 + vaesenc \T1, \XMM6, \XMM6 + vaesenc \T1, \XMM7, \XMM7 + vaesenc \T1, \XMM8, \XMM8 + + vmovdqa TMP7(%rsp), \T1 + vmovdqa HashKey_2(arg1), \T5 + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T4 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + + ####################################################################### + + vmovdqu 16*9(arg1), \T5 + vaesenc \T5, \XMM1, \XMM1 + vaesenc \T5, \XMM2, \XMM2 + vaesenc \T5, \XMM3, \XMM3 + vaesenc \T5, \XMM4, \XMM4 + vaesenc \T5, \XMM5, \XMM5 + vaesenc \T5, \XMM6, \XMM6 + vaesenc \T5, \XMM7, \XMM7 + vaesenc \T5, \XMM8, \XMM8 + + vmovdqa TMP8(%rsp), \T1 + vmovdqa HashKey(arg1), \T5 + + vpclmulqdq $0x00, \T5, \T1, \T3 + vpxor \T3, \T7, \T7 + + vpclmulqdq $0x01, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x10, \T5, \T1, \T3 + vpxor \T3, \T6, \T6 + + vpclmulqdq $0x11, \T5, \T1, \T3 + vpxor \T3, \T4, \T1 + + + vmovdqu 16*10(arg1), \T5 + + i = 0 + j = 1 + setreg +.rep 8 + vpxor 16*i(arg3, %r11), \T5, \T2 + .if \ENC_DEC == ENC + vaesenclast \T2, reg_j, reg_j + .else + vaesenclast \T2, reg_j, \T3 + vmovdqu 16*i(arg3, %r11), reg_j + vmovdqu \T3, 16*i(arg2, %r11) + .endif + i = (i+1) + j = (j+1) + setreg +.endr + ####################################################################### + + + vpslldq $8, \T6, \T3 # shift-L T3 2 DWs + vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs + vpxor \T3, \T7, \T7 + vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 + + + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs + + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + .if \ENC_DEC == ENC + vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer + vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer + .endif + + ####################################################################### + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T1, \T1 # the result is in T1 + + vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap + vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap + + + vpxor \T1, \XMM1, \XMM1 + + + +.endm + + +# GHASH the last 4 ciphertext blocks. +.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 + + ## Karatsuba Method + + vmovdqa HashKey_8(arg1), \T5 + + vpshufd $0b01001110, \XMM1, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM1, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM1, \T6 + vpclmulqdq $0x00, \T5, \XMM1, \T7 + + vpclmulqdq $0x00, \T3, \T2, \XMM1 + + ###################### + + vmovdqa HashKey_7(arg1), \T5 + vpshufd $0b01001110, \XMM2, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM2, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM2, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM2, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_6(arg1), \T5 + vpshufd $0b01001110, \XMM3, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM3, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM3, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM3, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_5(arg1), \T5 + vpshufd $0b01001110, \XMM4, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM4, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM4, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM4, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_4(arg1), \T5 + vpshufd $0b01001110, \XMM5, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM5, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM5, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM5, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_3(arg1), \T5 + vpshufd $0b01001110, \XMM6, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM6, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM6, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM6, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey_2(arg1), \T5 + vpshufd $0b01001110, \XMM7, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM7, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM7, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM7, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + + ###################### + + vmovdqa HashKey(arg1), \T5 + vpshufd $0b01001110, \XMM8, \T2 + vpshufd $0b01001110, \T5, \T3 + vpxor \XMM8, \T2, \T2 + vpxor \T5, \T3, \T3 + + vpclmulqdq $0x11, \T5, \XMM8, \T4 + vpxor \T4, \T6, \T6 + + vpclmulqdq $0x00, \T5, \XMM8, \T4 + vpxor \T4, \T7, \T7 + + vpclmulqdq $0x00, \T3, \T2, \T2 + + vpxor \T2, \XMM1, \XMM1 + vpxor \T6, \XMM1, \XMM1 + vpxor \T7, \XMM1, \T2 + + + + + vpslldq $8, \T2, \T4 + vpsrldq $8, \T2, \T2 + + vpxor \T4, \T7, \T7 + vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the + # accumulated carry-less multiplications + + ####################################################################### + #first phase of the reduction + vmovdqa POLY2(%rip), \T3 + + vpclmulqdq $0x01, \T7, \T3, \T2 + vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs + + vpxor \T2, \T7, \T7 # first phase of the reduction complete + ####################################################################### + + + #second phase of the reduction + vpclmulqdq $0x00, \T7, \T3, \T2 + vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) + + vpclmulqdq $0x10, \T7, \T3, \T4 + vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) + + vpxor \T2, \T4, \T4 # second phase of the reduction complete + ####################################################################### + vpxor \T4, \T6, \T6 # the result is in T6 +.endm + + + +# combined for GCM encrypt and decrypt functions +# clobbering all xmm registers +# clobbering r10, r11, r12, r13, r14, r15 +.macro GCM_ENC_DEC_AVX2 ENC_DEC + + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + + vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey + + mov arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # r13 = r13 - (r13 mod 16) + + mov %r13, %r12 + shr $4, %r12 + and $7, %r12 + jz _initial_num_blocks_is_0\@ + + cmp $7, %r12 + je _initial_num_blocks_is_7\@ + cmp $6, %r12 + je _initial_num_blocks_is_6\@ + cmp $5, %r12 + je _initial_num_blocks_is_5\@ + cmp $4, %r12 + je _initial_num_blocks_is_4\@ + cmp $3, %r12 + je _initial_num_blocks_is_3\@ + cmp $2, %r12 + je _initial_num_blocks_is_2\@ + + jmp _initial_num_blocks_is_1\@ + +_initial_num_blocks_is_7\@: + INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*7, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_6\@: + INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*6, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_5\@: + INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*5, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_4\@: + INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*4, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_3\@: + INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*3, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_2\@: + INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*2, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_1\@: + INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + sub $16*1, %r13 + jmp _initial_blocks_encrypted\@ + +_initial_num_blocks_is_0\@: + INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC + + +_initial_blocks_encrypted\@: + cmp $0, %r13 + je _zero_cipher_left\@ + + sub $128, %r13 + je _eight_cipher_left\@ + + + + + vmovd %xmm9, %r15d + and $255, %r15d + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + +_encrypt_by_8_new\@: + cmp $(255-8), %r15d + jg _encrypt_by_8\@ + + + + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + jmp _eight_cipher_left\@ + +_encrypt_by_8\@: + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $8, %r15b + GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + add $128, %r11 + sub $128, %r13 + jne _encrypt_by_8_new\@ + + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + + + + +_eight_cipher_left\@: + GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 + + +_zero_cipher_left\@: + cmp $16, arg4 + jl _only_less_than_16\@ + + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + sub $16, %r11 + add %r13, %r11 + vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer + # to be able to shift 16-r13 bytes + # (r13 is the number of bytes in plaintext mod 16) + vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask + vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes + jmp _final_ghash_mul\@ + +_only_less_than_16\@: + # check for 0 length + mov arg4, %r13 + and $15, %r13 # r13 = (arg4 mod 16) + + je _multiple_of_16_bytes\@ + + # handle the last <16 Byte block seperately + + + vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn) + + + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 # adjust the shuffle mask pointer to be + # able to shift 16-r13 bytes (r13 is the + # number of bytes in plaintext mod 16) + +_get_last_16_byte_loop\@: + movb (arg3, %r11), %al + movb %al, TMP1 (%rsp , %r11) + add $1, %r11 + cmp %r13, %r11 + jne _get_last_16_byte_loop\@ + + vmovdqu TMP1(%rsp), %xmm1 + + sub $16, %r11 + +_final_ghash_mul\@: + .if \ENC_DEC == DEC + vmovdqa %xmm1, %xmm2 + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm2, %xmm2 + vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + .else + vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) + vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9 + vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 + vpxor %xmm9, %xmm14, %xmm14 + #GHASH computation for the last <16 Byte block + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 + sub %r13, %r11 + add $16, %r11 + vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext + .endif + + + ############################# + # output r13 Bytes + vmovq %xmm9, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left\@ + + mov %rax, (arg2 , %r11) + add $8, %r11 + vpsrldq $8, %xmm9, %xmm9 + vmovq %xmm9, %rax + sub $8, %r13 + +_less_than_8_bytes_left\@: + movb %al, (arg2 , %r11) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left\@ + ############################# + +_multiple_of_16_bytes\@: + mov arg7, %r12 # r12 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + vmovd %r12d, %xmm15 # len(A) in xmm15 + + shl $3, arg4 # len(C) in bits (*128) + vmovq arg4, %xmm1 + vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 + vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) + + vpxor %xmm15, %xmm14, %xmm14 + GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap + + mov arg5, %rax # rax = *Y0 + vmovdqu (%rax), %xmm9 # xmm9 = Y0 + + ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0) + + vpxor %xmm14, %xmm9, %xmm9 + + + +_return_T\@: + mov arg8, %r10 # r10 = authTag + mov arg9, %r11 # r11 = auth_tag_len + + cmp $16, %r11 + je _T_16\@ + + cmp $12, %r11 + je _T_12\@ + +_T_8\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + jmp _return_T_done\@ +_T_12\@: + vmovq %xmm9, %rax + mov %rax, (%r10) + vpsrldq $8, %xmm9, %xmm9 + vmovd %xmm9, %eax + mov %eax, 8(%r10) + jmp _return_T_done\@ + +_T_16\@: + vmovdqu %xmm9, (%r10) + +_return_T_done\@: + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 +.endm + + +############################################################# +#void aesni_gcm_precomp_avx_gen4 +# (gcm_data *my_ctx_data, +# u8 *hash_subkey)# /* H, the Hash sub key input. +# Data starts on a 16-byte boundary. */ +############################################################# +ENTRY(aesni_gcm_precomp_avx_gen4) + #the number of pushes must equal STACK_OFFSET + push %r12 + push %r13 + push %r14 + push %r15 + + mov %rsp, %r14 + + + + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + + vmovdqu (arg2), %xmm6 # xmm6 = HashKey + + vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 + ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey + vmovdqa %xmm6, %xmm2 + vpsllq $1, %xmm6, %xmm6 + vpsrlq $63, %xmm2, %xmm2 + vmovdqa %xmm2, %xmm1 + vpslldq $8, %xmm2, %xmm2 + vpsrldq $8, %xmm1, %xmm1 + vpor %xmm2, %xmm6, %xmm6 + #reduction + vpshufd $0b00100100, %xmm1, %xmm2 + vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 + vpand POLY(%rip), %xmm2, %xmm2 + vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly + ####################################################################### + vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly + + + PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 + + mov %r14, %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + ret +ENDPROC(aesni_gcm_precomp_avx_gen4) + + +############################################################################### +#void aesni_gcm_enc_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +# const u8 *in, /* Plaintext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_enc_avx_gen4) + GCM_ENC_DEC_AVX2 ENC + ret +ENDPROC(aesni_gcm_enc_avx_gen4) + +############################################################################### +#void aesni_gcm_dec_avx_gen4( +# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ +# const u8 *in, /* Ciphertext input */ +# u64 plaintext_len, /* Length of data in Bytes for encryption. */ +# u8 *iv, /* Pre-counter block j0: 4 byte salt +# (from Security Association) concatenated with 8 byte +# Initialisation Vector (from IPSec ESP Payload) +# concatenated with 0x00000001. 16-byte aligned pointer. */ +# const u8 *aad, /* Additional Authentication Data (AAD)*/ +# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +# u8 *auth_tag, /* Authenticated Tag output. */ +# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +# Valid values are 16 (most likely), 12 or 8. */ +############################################################################### +ENTRY(aesni_gcm_dec_avx_gen4) + GCM_ENC_DEC_AVX2 DEC + ret +ENDPROC(aesni_gcm_dec_avx_gen4) + +#endif /* CONFIG_AS_AVX2 */ diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 835488b745ee..948ad0e77741 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -101,6 +101,9 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, int crypto_fpu_init(void); void crypto_fpu_exit(void); +#define AVX_GEN2_OPTSIZE 640 +#define AVX_GEN4_OPTSIZE 4096 + #ifdef CONFIG_X86_64 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); @@ -150,6 +153,123 @@ asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, u8 *hash_subkey, const u8 *aad, unsigned long aad_len, u8 *auth_tag, unsigned long auth_tag_len); + +#ifdef CONFIG_AS_AVX +/* + * asmlinkage void aesni_gcm_precomp_avx_gen2() + * gcm_data *my_ctx_data, context data + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + */ +asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey); + +asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void aesni_gcm_enc_avx(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (plaintext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} + +static void aesni_gcm_dec_avx(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (ciphertext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} +#endif + +#ifdef CONFIG_AS_AVX2 +/* + * asmlinkage void aesni_gcm_precomp_avx_gen4() + * gcm_data *my_ctx_data, context data + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + */ +asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey); + +asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void aesni_gcm_enc_avx2(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (plaintext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_enc(ctx, out, in, plaintext_len, iv, hash_subkey, aad, + aad_len, auth_tag, auth_tag_len); + } else if (plaintext_len < AVX_GEN4_OPTSIZE) { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); + aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} + +static void aesni_gcm_dec_avx2(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len) +{ + if (ciphertext_len < AVX_GEN2_OPTSIZE) { + aesni_gcm_dec(ctx, out, in, ciphertext_len, iv, hash_subkey, + aad, aad_len, auth_tag, auth_tag_len); + } else if (ciphertext_len < AVX_GEN4_OPTSIZE) { + aesni_gcm_precomp_avx_gen2(ctx, hash_subkey); + aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } else { + aesni_gcm_precomp_avx_gen4(ctx, hash_subkey); + aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad, + aad_len, auth_tag, auth_tag_len); + } +} +#endif + +static void (*aesni_gcm_enc_tfm)(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static void (*aesni_gcm_dec_tfm)(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + static inline struct aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) { @@ -915,7 +1035,7 @@ static int __driver_rfc4106_encrypt(struct aead_request *req) dst = src; } - aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, + aesni_gcm_enc_tfm(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst + ((unsigned long)req->cryptlen), auth_tag_len); @@ -996,12 +1116,12 @@ static int __driver_rfc4106_decrypt(struct aead_request *req) dst = src; } - aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, + aesni_gcm_dec_tfm(aes_ctx, dst, src, tempCipherLen, iv, ctx->hash_subkey, assoc, (unsigned long)req->assoclen, authTag, auth_tag_len); /* Compare generated tag with passed in tag. */ - retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? + retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ? -EBADMSG : 0; if (one_entry_in_sg) { @@ -1353,6 +1473,27 @@ static int __init aesni_init(void) if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; +#ifdef CONFIG_X86_64 +#ifdef CONFIG_AS_AVX2 + if (boot_cpu_has(X86_FEATURE_AVX2)) { + pr_info("AVX2 version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc_avx2; + aesni_gcm_dec_tfm = aesni_gcm_dec_avx2; + } else +#endif +#ifdef CONFIG_AS_AVX + if (boot_cpu_has(X86_FEATURE_AVX)) { + pr_info("AVX version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc_avx; + aesni_gcm_dec_tfm = aesni_gcm_dec_avx; + } else +#endif + { + pr_info("SSE version of gcm_enc/dec engaged.\n"); + aesni_gcm_enc_tfm = aesni_gcm_enc; + aesni_gcm_dec_tfm = aesni_gcm_dec; + } +#endif err = crypto_fpu_init(); if (err) diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c index 50ec333b70e6..8af519ed73d1 100644 --- a/arch/x86/crypto/blowfish_glue.c +++ b/arch/x86/crypto/blowfish_glue.c @@ -223,9 +223,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= 1; dst -= 1; } while (nbytes >= bsize * 4); - - if (nbytes < bsize) - goto done; } /* Handle leftovers */ diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index e6a3700489b9..e57e20ab5e0b 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -203,9 +203,6 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, src -= 1; dst -= 1; } while (nbytes >= bsize * CAST5_PARALLEL_BLOCKS); - - if (nbytes < bsize) - goto done; } /* Handle leftovers */ diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S index 586f41aac361..185fad49d86f 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_asm.S +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S @@ -24,10 +24,6 @@ .align 16 .Lbswap_mask: .octa 0x000102030405060708090a0b0c0d0e0f -.Lpoly: - .octa 0xc2000000000000000000000000000001 -.Ltwo_one: - .octa 0x00000001000000000000000000000001 #define DATA %xmm0 #define SHASH %xmm1 @@ -134,28 +130,3 @@ ENTRY(clmul_ghash_update) .Lupdate_just_ret: ret ENDPROC(clmul_ghash_update) - -/* - * void clmul_ghash_setkey(be128 *shash, const u8 *key); - * - * Calculate hash_key << 1 mod poly - */ -ENTRY(clmul_ghash_setkey) - movaps .Lbswap_mask, BSWAP - movups (%rsi), %xmm0 - PSHUFB_XMM BSWAP %xmm0 - movaps %xmm0, %xmm1 - psllq $1, %xmm0 - psrlq $63, %xmm1 - movaps %xmm1, %xmm2 - pslldq $8, %xmm1 - psrldq $8, %xmm2 - por %xmm1, %xmm0 - # reduction - pshufd $0b00100100, %xmm2, %xmm1 - pcmpeqd .Ltwo_one, %xmm1 - pand .Lpoly, %xmm1 - pxor %xmm1, %xmm0 - movups %xmm0, (%rdi) - ret -ENDPROC(clmul_ghash_setkey) diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 6759dd1135be..d785cf2c529c 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -30,8 +30,6 @@ void clmul_ghash_mul(char *dst, const be128 *shash); void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, const be128 *shash); -void clmul_ghash_setkey(be128 *shash, const u8 *key); - struct ghash_async_ctx { struct cryptd_ahash *cryptd_tfm; }; @@ -58,13 +56,23 @@ static int ghash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + be128 *x = (be128 *)key; + u64 a, b; if (keylen != GHASH_BLOCK_SIZE) { crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); return -EINVAL; } - clmul_ghash_setkey(&ctx->shash, key); + /* perform multiplication by 'x' in GF(2^128) */ + a = be64_to_cpu(x->a); + b = be64_to_cpu(x->b); + + ctx->shash.a = (__be64)((b << 1) | (a >> 63)); + ctx->shash.b = (__be64)((a << 1) | (b >> 63)); + + if (a >> 63) + ctx->shash.b ^= cpu_to_be64(0xc2); return 0; } diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S new file mode 100644 index 000000000000..1cd792db15ef --- /dev/null +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -0,0 +1,708 @@ +/* + * Implement fast SHA-1 with AVX2 instructions. (x86_64) + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2014 Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Contact Information: + * Ilya Albrekht <ilya.albrekht@intel.com> + * Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * + * BSD LICENSE + * + * Copyright(c) 2014 Intel Corporation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * SHA-1 implementation with Intel(R) AVX2 instruction set extensions. + * + *This implementation is based on the previous SSSE3 release: + *Visit http://software.intel.com/en-us/articles/ + *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/ + * + *Updates 20-byte SHA-1 record in 'hash' for even number of + *'num_blocks' consecutive 64-byte blocks + * + *extern "C" void sha1_transform_avx2( + * int *hash, const char* input, size_t num_blocks ); + */ + +#include <linux/linkage.h> + +#define CTX %rdi /* arg1 */ +#define BUF %rsi /* arg2 */ +#define CNT %rdx /* arg3 */ + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %eax +#define REG_E %edx +#define REG_TB %ebx +#define REG_TA %r12d +#define REG_RA %rcx +#define REG_RB %rsi +#define REG_RC %rdi +#define REG_RD %rax +#define REG_RE %rdx +#define REG_RTA %r12 +#define REG_RTB %rbx +#define REG_T1 %ebp +#define xmm_mov vmovups +#define avx2_zeroupper vzeroupper +#define RND_F1 1 +#define RND_F2 2 +#define RND_F3 3 + +.macro REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set TB, REG_TB + .set TA, REG_TA + + .set RA, REG_RA + .set RB, REG_RB + .set RC, REG_RC + .set RD, REG_RD + .set RE, REG_RE + + .set RTA, REG_RTA + .set RTB, REG_RTB + + .set T1, REG_T1 +.endm + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_PTR2 %r13 +#define BUFFER_END %r11 + +#define PRECALC_BUF %r14 +#define WK_BUF %r15 + +#define W_TMP %xmm0 +#define WY_TMP %ymm0 +#define WY_TMP2 %ymm9 + +# AVX2 variables +#define WY0 %ymm3 +#define WY4 %ymm5 +#define WY08 %ymm7 +#define WY12 %ymm8 +#define WY16 %ymm12 +#define WY20 %ymm13 +#define WY24 %ymm14 +#define WY28 %ymm15 + +#define YMM_SHUFB_BSWAP %ymm10 + +/* + * Keep 2 iterations precalculated at a time: + * - 80 DWORDs per iteration * 2 + */ +#define W_SIZE (80*2*2 +16) + +#define WK(t) ((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF) +#define PRECALC_WK(t) ((t)*2*2)(PRECALC_BUF) + + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +.macro PRECALC_RESET_WY + .set WY_00, WY0 + .set WY_04, WY4 + .set WY_08, WY08 + .set WY_12, WY12 + .set WY_16, WY16 + .set WY_20, WY20 + .set WY_24, WY24 + .set WY_28, WY28 + .set WY_32, WY_00 +.endm + +.macro PRECALC_ROTATE_WY + /* Rotate macros */ + .set WY_32, WY_28 + .set WY_28, WY_24 + .set WY_24, WY_20 + .set WY_20, WY_16 + .set WY_16, WY_12 + .set WY_12, WY_08 + .set WY_08, WY_04 + .set WY_04, WY_00 + .set WY_00, WY_32 + + /* Define register aliases */ + .set WY, WY_00 + .set WY_minus_04, WY_04 + .set WY_minus_08, WY_08 + .set WY_minus_12, WY_12 + .set WY_minus_16, WY_16 + .set WY_minus_20, WY_20 + .set WY_minus_24, WY_24 + .set WY_minus_28, WY_28 + .set WY_minus_32, WY +.endm + +.macro PRECALC_00_15 + .if (i == 0) # Initialize and rotate registers + PRECALC_RESET_WY + PRECALC_ROTATE_WY + .endif + + /* message scheduling pre-compute for rounds 0-15 */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + .elseif ((i & 7) == 1) + vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY + .elseif ((i & 7) == 4) + vpaddd K_XMM(K_BASE), WY, WY_TMP + .elseif ((i & 7) == 7) + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_16_31 + /* + * message scheduling pre-compute for rounds 16-31 + * calculating last 32 w[i] values in 8 XMM registers + * pre-calculate K+w[i] values and store to mem + * for later load by ALU add instruction + * + * "brute force" vectorization for rounds 16-31 only + * due to w[i]->w[i-3] dependency + */ + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + /* w[i-14] */ + vpalignr $8, WY_minus_16, WY_minus_12, WY + vpsrldq $4, WY_minus_04, WY_TMP /* w[i-3] */ + .elseif ((i & 7) == 1) + vpxor WY_minus_08, WY, WY + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 2) + vpxor WY_TMP, WY, WY + vpslldq $12, WY, WY_TMP2 + .elseif ((i & 7) == 3) + vpslld $1, WY, WY_TMP + vpsrld $31, WY, WY + .elseif ((i & 7) == 4) + vpor WY, WY_TMP, WY_TMP + vpslld $2, WY_TMP2, WY + .elseif ((i & 7) == 5) + vpsrld $30, WY_TMP2, WY_TMP2 + vpxor WY, WY_TMP, WY_TMP + .elseif ((i & 7) == 7) + vpxor WY_TMP2, WY_TMP, WY + vpaddd K_XMM(K_BASE), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC_32_79 + /* + * in SHA-1 specification: + * w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: + * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization + * since w[i]=>w[i-3] dependency is broken + */ + + .if ((i & 7) == 0) + /* + * blended AVX2 and ALU instruction scheduling + * 1 vector iteration per 8 rounds + */ + vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP + .elseif ((i & 7) == 1) + /* W is W_minus_32 before xor */ + vpxor WY_minus_28, WY, WY + .elseif ((i & 7) == 2) + vpxor WY_minus_16, WY_TMP, WY_TMP + .elseif ((i & 7) == 3) + vpxor WY_TMP, WY, WY + .elseif ((i & 7) == 4) + vpslld $2, WY, WY_TMP + .elseif ((i & 7) == 5) + vpsrld $30, WY, WY + vpor WY, WY_TMP, WY + .elseif ((i & 7) == 7) + vpaddd K_XMM(K_BASE), WY, WY_TMP + vmovdqu WY_TMP, PRECALC_WK(i&~7) + + PRECALC_ROTATE_WY + .endif +.endm + +.macro PRECALC r, s + .set i, \r + + .if (i < 40) + .set K_XMM, 32*0 + .elseif (i < 80) + .set K_XMM, 32*1 + .elseif (i < 120) + .set K_XMM, 32*2 + .else + .set K_XMM, 32*3 + .endif + + .if (i<32) + PRECALC_00_15 \s + .elseif (i<64) + PRECALC_16_31 \s + .elseif (i < 160) + PRECALC_32_79 \s + .endif +.endm + +.macro ROTATE_STATE + .set T_REG, E + .set E, D + .set D, C + .set C, B + .set B, TB + .set TB, A + .set A, T_REG + + .set T_REG, RE + .set RE, RD + .set RD, RC + .set RC, RB + .set RB, RTB + .set RTB, RA + .set RA, T_REG +.endm + +/* Macro relies on saved ROUND_Fx */ + +.macro RND_FUN f, r + .if (\f == RND_F1) + ROUND_F1 \r + .elseif (\f == RND_F2) + ROUND_F2 \r + .elseif (\f == RND_F3) + ROUND_F3 \r + .endif +.endm + +.macro RR r + .set round_id, (\r % 80) + + .if (round_id == 0) /* Precalculate F for first round */ + .set ROUND_FUNC, RND_F1 + mov B, TB + + rorx $(32-30), B, B /* b>>>2 */ + andn D, TB, T1 + and C, TB + xor T1, TB + .endif + + RND_FUN ROUND_FUNC, \r + ROTATE_STATE + + .if (round_id == 18) + .set ROUND_FUNC, RND_F2 + .elseif (round_id == 38) + .set ROUND_FUNC, RND_F3 + .elseif (round_id == 58) + .set ROUND_FUNC, RND_F2 + .endif + + .set round_id, ( (\r+1) % 80) + + RND_FUN ROUND_FUNC, (\r+1) + ROTATE_STATE +.endm + +.macro ROUND_F1 r + add WK(\r), E + + andn C, A, T1 /* ~b&d */ + lea (RE,RTB), E /* Add F from the previous round */ + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30),A, TB /* b>>>2 for next round */ + + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + /* + * Calculate F for the next round + * (b & c) ^ andn[b, d] + */ + and B, A /* b&c */ + xor T1, A /* F1 = (b&c) ^ (~b&d) */ + + lea (RE,RTA), E /* E += A >>> 5 */ +.endm + +.macro ROUND_F2 r + add WK(\r), E + lea (RE,RTB), E /* Add F from the previous round */ + + /* Calculate F for the next round */ + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + .if ((round_id) < 79) + rorx $(32-30), A, TB /* b>>>2 for next round */ + .endif + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + .if ((round_id) < 79) + xor B, A + .endif + + add TA, E /* E += A >>> 5 */ + + .if ((round_id) < 79) + xor C, A + .endif +.endm + +.macro ROUND_F3 r + add WK(\r), E + PRECALC (\r) /* msg scheduling for next 2 blocks */ + + lea (RE,RTB), E /* Add F from the previous round */ + + mov B, T1 + or A, T1 + + rorx $(32-5), A, TA /* T2 = A >>> 5 */ + rorx $(32-30), A, TB /* b>>>2 for next round */ + + /* Calculate F for the next round + * (b and c) or (d and (b or c)) + */ + and C, T1 + and B, A + or T1, A + + add TA, E /* E += A >>> 5 */ + +.endm + +/* + * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining + */ +.macro SHA1_PIPELINED_MAIN_BODY + + REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + mov %rsp, PRECALC_BUF + lea (2*4*80+32)(%rsp), WK_BUF + + # Precalc WK for first 2 blocks + PRECALC_OFFSET = 0 + .set i, 0 + .rept 160 + PRECALC i + .set i, i + 1 + .endr + PRECALC_OFFSET = 128 + xchg WK_BUF, PRECALC_BUF + + .align 32 +_loop: + /* + * code loops through more than one block + * we use K_BASE value as a signal of a last block, + * it is set below by: cmovae BUFFER_PTR, K_BASE + */ + cmp K_BASE, BUFFER_PTR + jne _begin + .align 32 + jmp _end + .align 32 +_begin: + + /* + * Do first block + * rounds: 0,2,4,6,8 + */ + .set j, 0 + .rept 5 + RR j + .set j, j+2 + .endr + + jmp _loop0 +_loop0: + + /* + * rounds: + * 10,12,14,16,18 + * 20,22,24,26,28 + * 30,32,34,36,38 + * 40,42,44,46,48 + * 50,52,54,56,58 + */ + .rept 25 + RR j + .set j, j+2 + .endr + + add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ + cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + + /* + * rounds + * 60,62,64,66,68 + * 70,72,74,76,78 + */ + .rept 10 + RR j + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + cmp K_BASE, BUFFER_PTR /* is current block the last one? */ + je _loop + + mov TB, B + + /* Process second block */ + /* + * rounds + * 0+80, 2+80, 4+80, 6+80, 8+80 + * 10+80,12+80,14+80,16+80,18+80 + */ + + .set j, 0 + .rept 10 + RR j+80 + .set j, j+2 + .endr + + jmp _loop1 +_loop1: + /* + * rounds + * 20+80,22+80,24+80,26+80,28+80 + * 30+80,32+80,34+80,36+80,38+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + jmp _loop2 +_loop2: + + /* + * rounds + * 40+80,42+80,44+80,46+80,48+80 + * 50+80,52+80,54+80,56+80,58+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ + + cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ + cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + + jmp _loop3 +_loop3: + + /* + * rounds + * 60+80,62+80,64+80,66+80,68+80 + * 70+80,72+80,74+80,76+80,78+80 + */ + .rept 10 + RR j+80 + .set j, j+2 + .endr + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), TB + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + /* Reset state for AVX2 reg permutation */ + mov A, TA + mov TB, A + mov C, TB + mov E, C + mov D, B + mov TA, D + + REGALLOC + + xchg WK_BUF, PRECALC_BUF + + jmp _loop + + .align 32 + _end: + +.endm +/* + * macro implements SHA-1 function's body for several 64-byte blocks + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + ENTRY(\name) + + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + RESERVE_STACK = (W_SIZE*4 + 8+24) + + /* Align stack */ + mov %rsp, %rbx + and $~(0x20-1), %rsp + push %rbx + sub $RESERVE_STACK, %rsp + + avx2_zeroupper + + lea K_XMM_AR(%rip), K_BASE + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + lea 64(BUF), BUFFER_PTR2 + + shl $6, CNT /* mul by 64 */ + add BUF, CNT + add $64, CNT + mov CNT, BUFFER_END + + cmp BUFFER_END, BUFFER_PTR2 + cmovae K_BASE, BUFFER_PTR2 + + xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + avx2_zeroupper + + add $RESERVE_STACK, %rsp + pop %rsp + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret + + ENDPROC(\name) +.endm + +.section .rodata + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.align 128 +K_XMM_AR: + .long K1, K1, K1, K1 + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f +.text + +SHA1_VECTOR_ASM sha1_transform_avx2 diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 4a11a9d72451..74d16ef707c7 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -10,6 +10,7 @@ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> * Copyright (c) Mathias Krause <minipli@googlemail.com> + * Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -39,6 +40,12 @@ asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, asmlinkage void sha1_transform_avx(u32 *digest, const char *data, unsigned int rounds); #endif +#ifdef CONFIG_AS_AVX2 +#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */ + +asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, + unsigned int rounds); +#endif static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); @@ -165,6 +172,18 @@ static int sha1_ssse3_import(struct shash_desc *desc, const void *in) return 0; } +#ifdef CONFIG_AS_AVX2 +static void sha1_apply_transform_avx2(u32 *digest, const char *data, + unsigned int rounds) +{ + /* Select the optimal transform based on data block size */ + if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE) + sha1_transform_avx2(digest, data, rounds); + else + sha1_transform_avx(digest, data, rounds); +} +#endif + static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_ssse3_init, @@ -201,27 +220,49 @@ static bool __init avx_usable(void) return true; } + +#ifdef CONFIG_AS_AVX2 +static bool __init avx2_usable(void) +{ + if (avx_usable() && cpu_has_avx2 && boot_cpu_has(X86_FEATURE_BMI1) && + boot_cpu_has(X86_FEATURE_BMI2)) + return true; + + return false; +} +#endif #endif static int __init sha1_ssse3_mod_init(void) { + char *algo_name; + /* test for SSSE3 first */ - if (cpu_has_ssse3) + if (cpu_has_ssse3) { sha1_transform_asm = sha1_transform_ssse3; + algo_name = "SSSE3"; + } #ifdef CONFIG_AS_AVX /* allow AVX to override SSSE3, it's a little faster */ - if (avx_usable()) + if (avx_usable()) { sha1_transform_asm = sha1_transform_avx; + algo_name = "AVX"; +#ifdef CONFIG_AS_AVX2 + /* allow AVX2 to override AVX, it's a little faster */ + if (avx2_usable()) { + sha1_transform_asm = sha1_apply_transform_avx2; + algo_name = "AVX2"; + } +#endif + } #endif if (sha1_transform_asm) { - pr_info("Using %s optimized SHA-1 implementation\n", - sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3" - : "AVX"); + pr_info("Using %s optimized SHA-1 implementation\n", algo_name); return crypto_register_shash(&alg); } - pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + pr_info("Neither AVX nor AVX2 nor SSSE3 is available/usable.\n"); return -ENODEV; } |