From 0bd82f5f6355775fbaf7d3c664432ce1b862be1e Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Thu, 4 Nov 2010 15:00:45 -0400 Subject: crypto: aesni-intel - RFC4106 AES-GCM Driver Using Intel New Instructions This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit kernels. It supports 128-bit AES key size. This leverages the crypto AEAD interface type to facilitate a combined AES & GCM operation to be implemented in assembly code. The assembly code leverages Intel(R) AES New Instructions and the PCLMULQDQ instruction. Signed-off-by: Adrian Hoban Signed-off-by: Tadeusz Struk Signed-off-by: Gabriele Paoloni Signed-off-by: Aidan O'Mahony Signed-off-by: Erdinc Ozturk Signed-off-by: James Guilford Signed-off-by: Wajdi Feghali Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 1192 ++++++++++++++++++++++++++++++++++++ arch/x86/crypto/aesni-intel_glue.c | 518 +++++++++++++++- 2 files changed, 1708 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index ff16756a51c..aafced54df6 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -9,6 +9,17 @@ * Vinodh Gopal * Kahraman Akdemir * + * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD + * interface for 64-bit kernels. + * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) + * Aidan O'Mahony (aidan.o.mahony@intel.com) + * Adrian Hoban + * James Guilford (james.guilford@intel.com) + * Gabriele Paoloni + * Tadeusz Struk (tadeusz.struk@intel.com) + * Wajdi Feghali (wajdi.k.feghali@intel.com) + * Copyright (c) 2010, Intel Corporation. + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -18,8 +29,60 @@ #include #include +.data +POLY: .octa 0xC2000000000000000000000000000001 +TWOONE: .octa 0x00000001000000000000000000000001 + +# order of these constants should not change. +# more specifically, ALL_F should follow SHIFT_MASK, +# and ZERO should follow ALL_F + +SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +MASK1: .octa 0x0000000000000000ffffffffffffffff +MASK2: .octa 0xffffffffffffffff0000000000000000 +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff +ZERO: .octa 0x00000000000000000000000000000000 +ONE: .octa 0x00000000000000000000000000000001 +F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 +dec: .octa 0x1 +enc: .octa 0x2 + + .text + +#define STACK_OFFSET 8*3 +#define HashKey 16*0 // store HashKey <<1 mod poly here +#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here +#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here +#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here +#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 + // bits of HashKey <<1 mod poly here + //(for Karatsuba purposes) +#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 + // bits of HashKey^2 <<1 mod poly here + // (for Karatsuba purposes) +#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 + // bits of HashKey^3 <<1 mod poly here + // (for Karatsuba purposes) +#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 + // bits of HashKey^4 <<1 mod poly here + // (for Karatsuba purposes) +#define VARIABLE_OFFSET 16*8 + +#define arg1 rdi +#define arg2 rsi +#define arg3 rdx +#define arg4 rcx +#define arg5 r8 +#define arg6 r9 +#define arg7 STACK_OFFSET+8(%r14) +#define arg8 STACK_OFFSET+16(%r14) +#define arg9 STACK_OFFSET+24(%r14) +#define arg10 STACK_OFFSET+32(%r14) + + #define STATE1 %xmm0 #define STATE2 %xmm4 #define STATE3 %xmm5 @@ -47,6 +110,1135 @@ #define T2 %r11 #define TCTR_LOW T2 + +/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +* +* +* Input: A and B (128-bits each, bit-reflected) +* Output: C = A*B*x mod poly, (i.e. >>1 ) +* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +* +*/ +.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 + movdqa \GH, \TMP1 + pshufd $78, \GH, \TMP2 + pshufd $78, \HK, \TMP3 + pxor \GH, \TMP2 # TMP2 = a1+a0 + pxor \HK, \TMP3 # TMP3 = b1+b0 + PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 + PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 + PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) + pxor \GH, \TMP2 + pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) + movdqa \TMP2, \TMP3 + pslldq $8, \TMP3 # left shift TMP3 2 DWs + psrldq $8, \TMP2 # right shift TMP2 2 DWs + pxor \TMP3, \GH + pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK + + # first phase of the reduction + + movdqa \GH, \TMP2 + movdqa \GH, \TMP3 + movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 + # in in order to perform + # independent shifts + pslld $31, \TMP2 # packed right shift <<31 + pslld $30, \TMP3 # packed right shift <<30 + pslld $25, \TMP4 # packed right shift <<25 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + movdqa \TMP2, \TMP5 + psrldq $4, \TMP5 # right shift TMP5 1 DW + pslldq $12, \TMP2 # left shift TMP2 3 DWs + pxor \TMP2, \GH + + # second phase of the reduction + + movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 + # in in order to perform + # independent shifts + movdqa \GH,\TMP3 + movdqa \GH,\TMP4 + psrld $1,\TMP2 # packed left shift >>1 + psrld $2,\TMP3 # packed left shift >>2 + psrld $7,\TMP4 # packed left shift >>7 + pxor \TMP3,\TMP2 # xor the shifted versions + pxor \TMP4,\TMP2 + pxor \TMP5, \TMP2 + pxor \TMP2, \GH + pxor \TMP1, \GH # result is in TMP1 +.endm + +/* +* if a = number of total plaintext bytes +* b = floor(a/16) +* num_initial_blocks = b mod 4 +* encrypt the initial num_initial_blocks blocks and apply ghash on +* the ciphertext +* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers +* are clobbered +* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified +*/ + +.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ +XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + + mov arg7, %r10 # %r10 = AAD + mov arg8, %r12 # %r12 = aadLen + mov %r12, %r11 + pxor %xmm\i, %xmm\i +_get_AAD_loop\num_initial_blocks\operation: + movd (%r10), \TMP1 + pslldq $12, \TMP1 + psrldq $4, %xmm\i + pxor \TMP1, %xmm\i + add $4, %r10 + sub $4, %r12 + jne _get_AAD_loop\num_initial_blocks\operation + cmp $16, %r11 + je _get_AAD_loop2_done\num_initial_blocks\operation + mov $16, %r12 +_get_AAD_loop2\num_initial_blocks\operation: + psrldq $4, %xmm\i + sub $4, %r12 + cmp %r11, %r12 + jne _get_AAD_loop2\num_initial_blocks\operation +_get_AAD_loop2_done\num_initial_blocks\operation: + pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data + xor %r11, %r11 # initialise the data pointer offset as zero + + # start AES for num_initial_blocks blocks + + mov %arg5, %rax # %rax = *Y0 + movdqu (%rax), \XMM0 # XMM0 = Y0 + pshufb SHUF_MASK(%rip), \XMM0 +.if \i_seq != 0 +.irpc index, \i_seq + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, %xmm\index + pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap +.endr +.irpc index, \i_seq + pxor 16*0(%arg1), %xmm\index +.endr +.irpc index, \i_seq + movaps 0x10(%rdi), \TMP1 + AESENC \TMP1, %xmm\index # Round 1 +.endr +.irpc index, \i_seq + movaps 0x20(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x30(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x40(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x50(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x60(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x70(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x80(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x90(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0xa0(%arg1), \TMP1 + AESENCLAST \TMP1, %xmm\index # Round 10 +.endr +.irpc index, \i_seq + movdqu (%arg3 , %r11, 1), \TMP1 + pxor \TMP1, %xmm\index + movdqu %xmm\index, (%arg2 , %r11, 1) + # write back plaintext/ciphertext for num_initial_blocks + add $16, %r11 +.if \operation == dec + movdqa \TMP1, %xmm\index +.endif + pshufb SHUF_MASK(%rip), %xmm\index + # prepare plaintext/ciphertext for GHASH computation +.endr +.endif + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks + +.if \i == 5 + pxor %xmm5, %xmm6 + GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm6, %xmm7 + GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.elseif \i == 6 + pxor %xmm6, %xmm7 + GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.elseif \i == 7 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.endif + cmp $64, %r13 + jl _initial_blocks_done\num_initial_blocks\operation + # no need for precomputed values +/* +* +* Precomputations for HashKey parallel with encryption of first 4 blocks. +* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +*/ + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, \XMM1 + pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, \XMM2 + pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, \XMM3 + pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, \XMM4 + pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap + pxor 16*0(%arg1), \XMM1 + pxor 16*0(%arg1), \XMM2 + pxor 16*0(%arg1), \XMM3 + pxor 16*0(%arg1), \XMM4 + movdqa \TMP3, \TMP5 + pshufd $78, \TMP3, \TMP1 + pxor \TMP3, \TMP1 + movdqa \TMP1, HashKey_k(%rsp) + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^2<<1 (mod poly) + movdqa \TMP5, HashKey_2(%rsp) +# HashKey_2 = HashKey^2<<1 (mod poly) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_2_k(%rsp) +.irpc index, 1234 # do 4 rounds + movaps 0x10*\index(%arg1), \TMP1 + AESENC \TMP1, \XMM1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 +.endr + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_3(%rsp) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_3_k(%rsp) +.irpc index, 56789 # do next 5 rounds + movaps 0x10*\index(%arg1), \TMP1 + AESENC \TMP1, \XMM1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 +.endr + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_4(%rsp) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_4_k(%rsp) + movaps 0xa0(%arg1), \TMP2 + AESENCLAST \TMP2, \XMM1 + AESENCLAST \TMP2, \XMM2 + AESENCLAST \TMP2, \XMM3 + AESENCLAST \TMP2, \XMM4 + movdqu 16*0(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM1 +.if \operation == dec + movdqu \XMM1, 16*0(%arg2 , %r11 , 1) + movdqa \TMP1, \XMM1 +.endif + movdqu 16*1(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM2 +.if \operation == dec + movdqu \XMM2, 16*1(%arg2 , %r11 , 1) + movdqa \TMP1, \XMM2 +.endif + movdqu 16*2(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM3 +.if \operation == dec + movdqu \XMM3, 16*2(%arg2 , %r11 , 1) + movdqa \TMP1, \XMM3 +.endif + movdqu 16*3(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM4 +.if \operation == dec + movdqu \XMM4, 16*3(%arg2 , %r11 , 1) + movdqa \TMP1, \XMM4 +.else + movdqu \XMM1, 16*0(%arg2 , %r11 , 1) + movdqu \XMM2, 16*1(%arg2 , %r11 , 1) + movdqu \XMM3, 16*2(%arg2 , %r11 , 1) + movdqu \XMM4, 16*3(%arg2 , %r11 , 1) +.endif + add $64, %r11 + pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap + pxor \XMMDst, \XMM1 +# combine GHASHed value with the corresponding ciphertext + pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap + pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap + pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap +_initial_blocks_done\num_initial_blocks\operation: +.endm + +/* +* encrypt 4 blocks at a time +* ghash the 4 previously encrypted ciphertext blocks +* arg1, %arg2, %arg3 are used as pointers only, not modified +* %r11 is the data offset value +*/ +.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \ +TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation + + movdqa \XMM1, \XMM5 + movdqa \XMM2, \XMM6 + movdqa \XMM3, \XMM7 + movdqa \XMM4, \XMM8 + + # multiply TMP5 * HashKey using karatsuba + + movdqa \XMM5, \TMP4 + pshufd $78, \XMM5, \TMP6 + pxor \XMM5, \TMP6 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa HashKey_4(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + movdqa \XMM0, \XMM1 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM2 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM3 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM4 + pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap + PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap + pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap + pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap + pxor (%arg1), \XMM1 + pxor (%arg1), \XMM2 + pxor (%arg1), \XMM3 + pxor (%arg1), \XMM4 + movdqa HashKey_4_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + movaps 0x10(%arg1), \TMP1 + AESENC \TMP1, \XMM1 # Round 1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 + movaps 0x20(%arg1), \TMP1 + AESENC \TMP1, \XMM1 # Round 2 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 + movdqa \XMM6, \TMP1 + pshufd $78, \XMM6, \TMP2 + pxor \XMM6, \TMP2 + movdqa HashKey_3(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + movaps 0x30(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 3 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + movaps 0x40(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 4 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + movdqa HashKey_3_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movaps 0x50(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 5 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + pxor \TMP1, \TMP4 +# accumulate the results in TMP4:XMM5, TMP6 holds the middle part + pxor \XMM6, \XMM5 + pxor \TMP2, \TMP6 + movdqa \XMM7, \TMP1 + pshufd $78, \XMM7, \TMP2 + pxor \XMM7, \TMP2 + movdqa HashKey_2(%rsp ), \TMP5 + + # Multiply TMP5 * HashKey using karatsuba + + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + movaps 0x60(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 6 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + movaps 0x70(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 7 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + movdqa HashKey_2_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movaps 0x80(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 8 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + pxor \TMP1, \TMP4 +# accumulate the results in TMP4:XMM5, TMP6 holds the middle part + pxor \XMM7, \XMM5 + pxor \TMP2, \TMP6 + + # Multiply XMM8 * HashKey + # XMM8 and TMP5 hold the values for the two operands + + movdqa \XMM8, \TMP1 + pshufd $78, \XMM8, \TMP2 + pxor \XMM8, \TMP2 + movdqa HashKey(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + movaps 0x90(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 9 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + movaps 0xa0(%arg1), \TMP3 + AESENCLAST \TMP3, \XMM1 # Round 10 + AESENCLAST \TMP3, \XMM2 + AESENCLAST \TMP3, \XMM3 + AESENCLAST \TMP3, \XMM4 + movdqa HashKey_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movdqu (%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK +.if \operation == dec + movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer + movdqa \TMP3, \XMM1 +.endif + movdqu 16(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK +.if \operation == dec + movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer + movdqa \TMP3, \XMM2 +.endif + movdqu 32(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK +.if \operation == dec + movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer + movdqa \TMP3, \XMM3 +.endif + movdqu 48(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK +.if \operation == dec + movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer + movdqa \TMP3, \XMM4 +.else + movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer +.endif + pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap + pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap + pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap + pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte sway + + pxor \TMP4, \TMP1 + pxor \XMM8, \XMM5 + pxor \TMP6, \TMP2 + pxor \TMP1, \TMP2 + pxor \XMM5, \TMP2 + movdqa \TMP2, \TMP3 + pslldq $8, \TMP3 # left shift TMP3 2 DWs + psrldq $8, \TMP2 # right shift TMP2 2 DWs + pxor \TMP3, \XMM5 + pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 + + # first phase of reduction + + movdqa \XMM5, \TMP2 + movdqa \XMM5, \TMP3 + movdqa \XMM5, \TMP4 +# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently + pslld $31, \TMP2 # packed right shift << 31 + pslld $30, \TMP3 # packed right shift << 30 + pslld $25, \TMP4 # packed right shift << 25 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + movdqa \TMP2, \TMP5 + psrldq $4, \TMP5 # right shift T5 1 DW + pslldq $12, \TMP2 # left shift T2 3 DWs + pxor \TMP2, \XMM5 + + # second phase of reduction + + movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 + movdqa \XMM5,\TMP3 + movdqa \XMM5,\TMP4 + psrld $1, \TMP2 # packed left shift >>1 + psrld $2, \TMP3 # packed left shift >>2 + psrld $7, \TMP4 # packed left shift >>7 + pxor \TMP3,\TMP2 # xor the shifted versions + pxor \TMP4,\TMP2 + pxor \TMP5, \TMP2 + pxor \TMP2, \XMM5 + pxor \TMP1, \XMM5 # result is in TMP1 + + pxor \XMM5, \XMM1 +.endm + +/* GHASH the last 4 ciphertext blocks. */ +.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ +TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst + + # Multiply TMP6 * HashKey (using Karatsuba) + + movdqa \XMM1, \TMP6 + pshufd $78, \XMM1, \TMP2 + pxor \XMM1, \TMP2 + movdqa HashKey_4(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 + PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 + movdqa HashKey_4_k(%rsp), \TMP4 + PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movdqa \XMM1, \XMMDst + movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 + + # Multiply TMP1 * HashKey (using Karatsuba) + + movdqa \XMM2, \TMP1 + pshufd $78, \XMM2, \TMP2 + pxor \XMM2, \TMP2 + movdqa HashKey_3(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 + movdqa HashKey_3_k(%rsp), \TMP4 + PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pxor \TMP1, \TMP6 + pxor \XMM2, \XMMDst + pxor \TMP2, \XMM1 +# results accumulated in TMP6, XMMDst, XMM1 + + # Multiply TMP1 * HashKey (using Karatsuba) + + movdqa \XMM3, \TMP1 + pshufd $78, \XMM3, \TMP2 + pxor \XMM3, \TMP2 + movdqa HashKey_2(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 + movdqa HashKey_2_k(%rsp), \TMP4 + PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pxor \TMP1, \TMP6 + pxor \XMM3, \XMMDst + pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 + + # Multiply TMP1 * HashKey (using Karatsuba) + movdqa \XMM4, \TMP1 + pshufd $78, \XMM4, \TMP2 + pxor \XMM4, \TMP2 + movdqa HashKey(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 + movdqa HashKey_k(%rsp), \TMP4 + PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pxor \TMP1, \TMP6 + pxor \XMM4, \XMMDst + pxor \XMM1, \TMP2 + pxor \TMP6, \TMP2 + pxor \XMMDst, \TMP2 + # middle section of the temp results combined as in karatsuba algorithm + movdqa \TMP2, \TMP4 + pslldq $8, \TMP4 # left shift TMP4 2 DWs + psrldq $8, \TMP2 # right shift TMP2 2 DWs + pxor \TMP4, \XMMDst + pxor \TMP2, \TMP6 +# TMP6:XMMDst holds the result of the accumulated carry-less multiplications + # first phase of the reduction + movdqa \XMMDst, \TMP2 + movdqa \XMMDst, \TMP3 + movdqa \XMMDst, \TMP4 +# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently + pslld $31, \TMP2 # packed right shifting << 31 + pslld $30, \TMP3 # packed right shifting << 30 + pslld $25, \TMP4 # packed right shifting << 25 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + movdqa \TMP2, \TMP7 + psrldq $4, \TMP7 # right shift TMP7 1 DW + pslldq $12, \TMP2 # left shift TMP2 3 DWs + pxor \TMP2, \XMMDst + + # second phase of the reduction + movdqa \XMMDst, \TMP2 + # make 3 copies of XMMDst for doing 3 shift operations + movdqa \XMMDst, \TMP3 + movdqa \XMMDst, \TMP4 + psrld $1, \TMP2 # packed left shift >> 1 + psrld $2, \TMP3 # packed left shift >> 2 + psrld $7, \TMP4 # packed left shift >> 7 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + pxor \TMP7, \TMP2 + pxor \TMP2, \XMMDst + pxor \TMP6, \XMMDst # reduced result is in XMMDst +.endm + +/* Encryption of a single block done*/ +.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 + + pxor (%arg1), \XMM0 + movaps 16(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 32(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 48(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 64(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 80(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 96(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 112(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 128(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 144(%arg1), \TMP1 + AESENC \TMP1, \XMM0 + movaps 160(%arg1), \TMP1 + AESENCLAST \TMP1, \XMM0 +.endm + + +/***************************************************************************** +* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* u8 *out, // Plaintext output. Encrypt in-place is allowed. +* const u8 *in, // Ciphertext input +* u64 plaintext_len, // Length of data in bytes for decryption. +* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) +* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) +* // concatenated with 0x00000001. 16-byte aligned pointer. +* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. +* const u8 *aad, // Additional Authentication Data (AAD) +* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes +* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the +* // given authentication tag and only return the plaintext if they match. +* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 +* // (most likely), 12 or 8. +* +* Assumptions: +* +* keys: +* keys are pre-expanded and aligned to 16 bytes. we are using the first +* set of 11 keys in the data structure void *aes_ctx +* +* iv: +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | Salt (From the SA) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | Initialization Vector | +* | (This is the sequence number from IPSec header) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x1 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* +* +* AAD: +* AAD padded to 128 bits with 0 +* for example, assume AAD is a u32 vector +* +* if AAD is 8 bytes: +* AAD[3] = {A0, A1}; +* padded AAD in xmm register = {A1 A0 0 0} +* +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | SPI (A1) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 32-bit Sequence Number (A0) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x0 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* AAD Format with 32-bit Sequence Number +* +* if AAD is 12 bytes: +* AAD[3] = {A0, A1, A2}; +* padded AAD in xmm register = {A2 A1 A0 0} +* +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | SPI (A2) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 64-bit Extended Sequence Number {A1,A0} | +* | | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x0 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* AAD Format with 64-bit Extended Sequence Number +* +* aadLen: +* from the definition of the spec, aadLen can only be 8 or 12 bytes. +* The code supports 16 too but for other sizes, the code will fail. +* +* TLen: +* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +* For other sizes, the code will fail. +* +* poly = x^128 + x^127 + x^126 + x^121 + 1 +* +*****************************************************************************/ + +ENTRY(aesni_gcm_dec) + push %r12 + push %r13 + push %r14 + mov %rsp, %r14 +/* +* states of %xmm registers %xmm6:%xmm15 not saved +* all %xmm registers are clobbered +*/ + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + mov %arg6, %r12 + movdqu (%r12), %xmm13 # %xmm13 = HashKey + pshufb SHUF_MASK(%rip), %xmm13 + +# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) + + movdqa %xmm13, %xmm2 + psllq $1, %xmm13 + psrlq $63, %xmm2 + movdqa %xmm2, %xmm1 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + por %xmm2, %xmm13 + + # Reduction + + pshufd $0x24, %xmm1, %xmm2 + pcmpeqd TWOONE(%rip), %xmm2 + pand POLY(%rip), %xmm2 + pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) + + + # Decrypt first few blocks + + movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) + mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) + mov %r13, %r12 + and $(3<<4), %r12 + jz _initial_num_blocks_is_0_decrypt + cmp $(2<<4), %r12 + jb _initial_num_blocks_is_1_decrypt + je _initial_num_blocks_is_2_decrypt +_initial_num_blocks_is_3_decrypt: + INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec + sub $48, %r13 + jmp _initial_blocks_decrypted +_initial_num_blocks_is_2_decrypt: + INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec + sub $32, %r13 + jmp _initial_blocks_decrypted +_initial_num_blocks_is_1_decrypt: + INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec + sub $16, %r13 + jmp _initial_blocks_decrypted +_initial_num_blocks_is_0_decrypt: + INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec +_initial_blocks_decrypted: + cmp $0, %r13 + je _zero_cipher_left_decrypt + sub $64, %r13 + je _four_cipher_left_decrypt +_decrypt_by_4: + GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ +%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec + add $64, %r11 + sub $64, %r13 + jne _decrypt_by_4 +_four_cipher_left_decrypt: + GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ +%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 +_zero_cipher_left_decrypt: + mov %arg4, %r13 + and $15, %r13 # %r13 = arg4 (mod 16) + je _multiple_of_16_bytes_decrypt + + # Handle the last <16 byte block seperately + + paddd ONE(%rip), %xmm0 # increment CNT to get Yn + pshufb SHUF_MASK(%rip), %xmm0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) + sub $16, %r11 + add %r13, %r11 + movdqu (%arg3,%r11,1), %xmm1 # recieve the last <16 byte block + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 +# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes +# (%r13 is the number of bytes in plaintext mod 16) + movdqu (%r12), %xmm2 # get the appropriate shuffle mask + pshufb %xmm2, %xmm1 # right shift 16-%r13 butes + movdqa %xmm1, %xmm2 + pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) + movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 + pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 + pand %xmm1, %xmm2 + pshufb SHUF_MASK(%rip),%xmm2 + pxor %xmm2, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # GHASH computation for the last <16 byte block + sub %r13, %r11 + add $16, %r11 + + # output %r13 bytes + movq %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_decrypt + mov %rax, (%arg2 , %r11, 1) + add $8, %r11 + psrldq $8, %xmm0 + movq %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_decrypt: + mov %al, (%arg2, %r11, 1) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_decrypt +_multiple_of_16_bytes_decrypt: + mov arg8, %r12 # %r13 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + movd %r12d, %xmm15 # len(A) in %xmm15 + shl $3, %arg4 # len(C) in bits (*128) + movq %arg4, %xmm1 + pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 + pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) + pxor %xmm15, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # final GHASH computation + pshufb SHUF_MASK(%rip), %xmm8 + mov %arg5, %rax # %rax = *Y0 + movdqu (%rax), %xmm0 # %xmm0 = Y0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) + pxor %xmm8, %xmm0 +_return_T_decrypt: + mov arg9, %r10 # %r10 = authTag + mov arg10, %r11 # %r11 = auth_tag_len + cmp $16, %r11 + je _T_16_decrypt + cmp $12, %r11 + je _T_12_decrypt +_T_8_decrypt: + movq %xmm0, %rax + mov %rax, (%r10) + jmp _return_T_done_decrypt +_T_12_decrypt: + movq %xmm0, %rax + mov %rax, (%r10) + psrldq $8, %xmm0 + movd %xmm0, %eax + mov %eax, 8(%r10) + jmp _return_T_done_decrypt +_T_16_decrypt: + movdqu %xmm0, (%r10) +_return_T_done_decrypt: + mov %r14, %rsp + pop %r14 + pop %r13 + pop %r12 + ret + + +/***************************************************************************** +* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* u8 *out, // Ciphertext output. Encrypt in-place is allowed. +* const u8 *in, // Plaintext input +* u64 plaintext_len, // Length of data in bytes for encryption. +* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) +* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) +* // concatenated with 0x00000001. 16-byte aligned pointer. +* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. +* const u8 *aad, // Additional Authentication Data (AAD) +* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes +* u8 *auth_tag, // Authenticated Tag output. +* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), +* // 12 or 8. +* +* Assumptions: +* +* keys: +* keys are pre-expanded and aligned to 16 bytes. we are using the +* first set of 11 keys in the data structure void *aes_ctx +* +* +* iv: +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | Salt (From the SA) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | Initialization Vector | +* | (This is the sequence number from IPSec header) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x1 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* +* +* AAD: +* AAD padded to 128 bits with 0 +* for example, assume AAD is a u32 vector +* +* if AAD is 8 bytes: +* AAD[3] = {A0, A1}; +* padded AAD in xmm register = {A1 A0 0 0} +* +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | SPI (A1) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 32-bit Sequence Number (A0) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x0 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* AAD Format with 32-bit Sequence Number +* +* if AAD is 12 bytes: +* AAD[3] = {A0, A1, A2}; +* padded AAD in xmm register = {A2 A1 A0 0} +* +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | SPI (A2) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 64-bit Extended Sequence Number {A1,A0} | +* | | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x0 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* AAD Format with 64-bit Extended Sequence Number +* +* aadLen: +* from the definition of the spec, aadLen can only be 8 or 12 bytes. +* The code supports 16 too but for other sizes, the code will fail. +* +* TLen: +* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +* For other sizes, the code will fail. +* +* poly = x^128 + x^127 + x^126 + x^121 + 1 +***************************************************************************/ +ENTRY(aesni_gcm_enc) + push %r12 + push %r13 + push %r14 + mov %rsp, %r14 +# +# states of %xmm registers %xmm6:%xmm15 not saved +# all %xmm registers are clobbered +# + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp + mov %arg6, %r12 + movdqu (%r12), %xmm13 + pshufb SHUF_MASK(%rip), %xmm13 + +# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) + + movdqa %xmm13, %xmm2 + psllq $1, %xmm13 + psrlq $63, %xmm2 + movdqa %xmm2, %xmm1 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + por %xmm2, %xmm13 + + # reduce HashKey<<1 + + pshufd $0x24, %xmm1, %xmm2 + pcmpeqd TWOONE(%rip), %xmm2 + pand POLY(%rip), %xmm2 + pxor %xmm2, %xmm13 + movdqa %xmm13, HashKey(%rsp) + mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) + and $-16, %r13 + mov %r13, %r12 + + # Encrypt first few blocks + + and $(3<<4), %r12 + jz _initial_num_blocks_is_0_encrypt + cmp $(2<<4), %r12 + jb _initial_num_blocks_is_1_encrypt + je _initial_num_blocks_is_2_encrypt +_initial_num_blocks_is_3_encrypt: + INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc + sub $48, %r13 + jmp _initial_blocks_encrypted +_initial_num_blocks_is_2_encrypt: + INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc + sub $32, %r13 + jmp _initial_blocks_encrypted +_initial_num_blocks_is_1_encrypt: + INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc + sub $16, %r13 + jmp _initial_blocks_encrypted +_initial_num_blocks_is_0_encrypt: + INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc +_initial_blocks_encrypted: + + # Main loop - Encrypt remaining blocks + + cmp $0, %r13 + je _zero_cipher_left_encrypt + sub $64, %r13 + je _four_cipher_left_encrypt +_encrypt_by_4_encrypt: + GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ +%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc + add $64, %r11 + sub $64, %r13 + jne _encrypt_by_4_encrypt +_four_cipher_left_encrypt: + GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ +%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 +_zero_cipher_left_encrypt: + mov %arg4, %r13 + and $15, %r13 # %r13 = arg4 (mod 16) + je _multiple_of_16_bytes_encrypt + + # Handle the last <16 Byte block seperately + paddd ONE(%rip), %xmm0 # INCR CNT to get Yn + pshufb SHUF_MASK(%rip), %xmm0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) + sub $16, %r11 + add %r13, %r11 + movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 + # adjust the shuffle mask pointer to be able to shift 16-r13 bytes + # (%r13 is the number of bytes in plaintext mod 16) + movdqu (%r12), %xmm2 # get the appropriate shuffle mask + pshufb %xmm2, %xmm1 # shift right 16-r13 byte + pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) + movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out top 16-r13 bytes of xmm0 + pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 + + pshufb SHUF_MASK(%rip),%xmm0 + pxor %xmm0, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # GHASH computation for the last <16 byte block + sub %r13, %r11 + add $16, %r11 + pshufb SHUF_MASK(%rip), %xmm0 + # shuffle xmm0 back to output as ciphertext + + # Output %r13 bytes + movq %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_encrypt + mov %rax, (%arg2 , %r11, 1) + add $8, %r11 + psrldq $8, %xmm0 + movq %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_encrypt: + mov %al, (%arg2, %r11, 1) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_encrypt +_multiple_of_16_bytes_encrypt: + mov arg8, %r12 # %r12 = addLen (number of bytes) + shl $3, %r12 + movd %r12d, %xmm15 # len(A) in %xmm15 + shl $3, %arg4 # len(C) in bits (*128) + movq %arg4, %xmm1 + pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 + pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) + pxor %xmm15, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # final GHASH computation + + pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap + mov %arg5, %rax # %rax = *Y0 + movdqu (%rax), %xmm0 # %xmm0 = Y0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) + pxor %xmm8, %xmm0 +_return_T_encrypt: + mov arg9, %r10 # %r10 = authTag + mov arg10, %r11 # %r11 = auth_tag_len + cmp $16, %r11 + je _T_16_encrypt + cmp $12, %r11 + je _T_12_encrypt +_T_8_encrypt: + movq %xmm0, %rax + mov %rax, (%r10) + jmp _return_T_done_encrypt +_T_12_encrypt: + movq %xmm0, %rax + mov %rax, (%r10) + psrldq $8, %xmm0 + movd %xmm0, %eax + mov %eax, 8(%r10) + jmp _return_T_done_encrypt +_T_16_encrypt: + movdqu %xmm0, (%r10) +_return_T_done_encrypt: + mov %r14, %rsp + pop %r14 + pop %r13 + pop %r12 + ret + + + _key_expansion_128: _key_expansion_256a: pshufd $0b11111111, %xmm1, %xmm1 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2cb3dcc4490..02d349d6442 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -5,6 +5,14 @@ * Copyright (C) 2008, Intel Corp. * Author: Huang Ying * + * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD + * interface for 64-bit kernels. + * Authors: Adrian Hoban + * Gabriele Paoloni + * Tadeusz Struk (tadeusz.struk@intel.com) + * Aidan O'Mahony (aidan.o.mahony@intel.com) + * Copyright (c) 2010, Intel Corporation. + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -21,6 +29,10 @@ #include #include #include +#include +#include +#include +#include #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) #define HAS_CTR @@ -42,8 +54,31 @@ struct async_aes_ctx { struct cryptd_ablkcipher *cryptd_tfm; }; -#define AESNI_ALIGN 16 +/* This data is stored at the end of the crypto_tfm struct. + * It's a type of per "session" data storage location. + * This needs to be 16 byte aligned. + */ +struct aesni_rfc4106_gcm_ctx { + u8 hash_subkey[16]; + struct crypto_aes_ctx aes_key_expanded; + u8 nonce[4]; + struct cryptd_aead *cryptd_tfm; +}; + +struct aesni_gcm_set_hash_subkey_result { + int err; + struct completion completion; +}; + +struct aesni_hash_subkey_req_data { + u8 iv[16]; + struct aesni_gcm_set_hash_subkey_result result; + struct scatterlist sg; +}; + +#define AESNI_ALIGN (16) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) +#define RFC4106_HASH_SUBKEY_SIZE 16 asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); @@ -62,6 +97,57 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +/* asmlinkage void aesni_gcm_enc() + * void *ctx, AES Key schedule. Starts on a 16 byte boundary. + * u8 *out, Ciphertext output. Encrypt in-place is allowed. + * const u8 *in, Plaintext input + * unsigned long plaintext_len, Length of data in bytes for encryption. + * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) + * concatenated with 8 byte Initialisation Vector (from IPSec ESP + * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + * const u8 *aad, Additional Authentication Data (AAD) + * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this + * is going to be 8 or 12 bytes + * u8 *auth_tag, Authenticated Tag output. + * unsigned long auth_tag_len), Authenticated Tag Length in bytes. + * Valid values are 16 (most likely), 12 or 8. + */ +asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +/* asmlinkage void aesni_gcm_dec() + * void *ctx, AES Key schedule. Starts on a 16 byte boundary. + * u8 *out, Plaintext output. Decrypt in-place is allowed. + * const u8 *in, Ciphertext input + * unsigned long ciphertext_len, Length of data in bytes for decryption. + * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) + * concatenated with 8 byte Initialisation Vector (from IPSec ESP + * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + * const u8 *aad, Additional Authentication Data (AAD) + * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going + * to be 8 or 12 bytes + * u8 *auth_tag, Authenticated Tag output. + * unsigned long auth_tag_len) Authenticated Tag Length in bytes. + * Valid values are 16 (most likely), 12 or 8. + */ +asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static inline struct +aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) +{ + return + (struct aesni_rfc4106_gcm_ctx *) + PTR_ALIGN((u8 *) + crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN); +} + static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) { unsigned long addr = (unsigned long)raw_ctx; @@ -730,6 +816,422 @@ static struct crypto_alg ablk_xts_alg = { }; #endif +static int rfc4106_init(struct crypto_tfm *tfm) +{ + struct cryptd_aead *cryptd_tfm; + struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) + PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); + cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_aead.reqsize = sizeof(struct aead_request) + + crypto_aead_reqsize(&cryptd_tfm->base); + return 0; +} + +static void rfc4106_exit(struct crypto_tfm *tfm) +{ + struct aesni_rfc4106_gcm_ctx *ctx = + (struct aesni_rfc4106_gcm_ctx *) + PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); + if (!IS_ERR(ctx->cryptd_tfm)) + cryptd_free_aead(ctx->cryptd_tfm); + return; +} + +static void +rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err) +{ + struct aesni_gcm_set_hash_subkey_result *result = req->data; + + if (err == -EINPROGRESS) + return; + result->err = err; + complete(&result->completion); +} + +static int +rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) +{ + struct crypto_ablkcipher *ctr_tfm; + struct ablkcipher_request *req; + int ret = -EINVAL; + struct aesni_hash_subkey_req_data *req_data; + + ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0); + if (IS_ERR(ctr_tfm)) + return PTR_ERR(ctr_tfm); + + crypto_ablkcipher_clear_flags(ctr_tfm, ~0); + + ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); + if (ret) { + crypto_free_ablkcipher(ctr_tfm); + return ret; + } + + req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); + if (!req) { + crypto_free_ablkcipher(ctr_tfm); + return -EINVAL; + } + + req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); + if (!req_data) { + crypto_free_ablkcipher(ctr_tfm); + return -ENOMEM; + } + memset(req_data->iv, 0, sizeof(req_data->iv)); + + /* Clear the data in the hash sub key container to zero.*/ + /* We want to cipher all zeros to create the hash sub key. */ + memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); + + init_completion(&req_data->result.completion); + sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE); + ablkcipher_request_set_tfm(req, ctr_tfm); + ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + rfc4106_set_hash_subkey_done, + &req_data->result); + + ablkcipher_request_set_crypt(req, &req_data->sg, + &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv); + + ret = crypto_ablkcipher_encrypt(req); + if (ret == -EINPROGRESS || ret == -EBUSY) { + ret = wait_for_completion_interruptible + (&req_data->result.completion); + if (!ret) + ret = req_data->result.err; + } + ablkcipher_request_free(req); + kfree(req_data); + crypto_free_ablkcipher(ctr_tfm); + return ret; +} + +static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, + unsigned int key_len) +{ + int ret = 0; + struct crypto_tfm *tfm = crypto_aead_tfm(parent); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); + u8 *new_key_mem = NULL; + + if (key_len < 4) { + crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + /*Account for 4 byte nonce at the end.*/ + key_len -= 4; + if (key_len != AES_KEYSIZE_128) { + crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); + /*This must be on a 16 byte boundary!*/ + if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN) + return -EINVAL; + + if ((unsigned long)key % AESNI_ALIGN) { + /*key is not aligned: use an auxuliar aligned pointer*/ + new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL); + if (!new_key_mem) + return -ENOMEM; + + new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); + memcpy(new_key_mem, key, key_len); + key = new_key_mem; + } + + if (!irq_fpu_usable()) + ret = crypto_aes_expand_key(&(ctx->aes_key_expanded), + key, key_len); + else { + kernel_fpu_begin(); + ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len); + kernel_fpu_end(); + } + /*This must be on a 16 byte boundary!*/ + if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) { + ret = -EINVAL; + goto exit; + } + ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); +exit: + kfree(new_key_mem); + return ret; +} + +/* This is the Integrity Check Value (aka the authentication tag length and can + * be 8, 12 or 16 bytes long. */ +static int rfc4106_set_authsize(struct crypto_aead *parent, + unsigned int authsize) +{ + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + + switch (authsize) { + case 8: + case 12: + case 16: + break; + default: + return -EINVAL; + } + crypto_aead_crt(parent)->authsize = authsize; + crypto_aead_crt(cryptd_child)->authsize = authsize; + return 0; +} + +static int rfc4106_encrypt(struct aead_request *req) +{ + int ret; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + + if (!irq_fpu_usable()) { + struct aead_request *cryptd_req = + (struct aead_request *) aead_request_ctx(req); + memcpy(cryptd_req, req, sizeof(*req)); + aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + return crypto_aead_encrypt(cryptd_req); + } else { + kernel_fpu_begin(); + ret = cryptd_child->base.crt_aead.encrypt(req); + kernel_fpu_end(); + return ret; + } +} + +static int rfc4106_decrypt(struct aead_request *req) +{ + int ret; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + + if (!irq_fpu_usable()) { + struct aead_request *cryptd_req = + (struct aead_request *) aead_request_ctx(req); + memcpy(cryptd_req, req, sizeof(*req)); + aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + return crypto_aead_decrypt(cryptd_req); + } else { + kernel_fpu_begin(); + ret = cryptd_child->base.crt_aead.decrypt(req); + kernel_fpu_end(); + return ret; + } +} + +static struct crypto_alg rfc4106_alg = { + .cra_name = "rfc4106(gcm(aes))", + .cra_driver_name = "rfc4106-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, + .cra_alignmask = 0, + .cra_type = &crypto_nivaead_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list), + .cra_init = rfc4106_init, + .cra_exit = rfc4106_exit, + .cra_u = { + .aead = { + .setkey = rfc4106_set_key, + .setauthsize = rfc4106_set_authsize, + .encrypt = rfc4106_encrypt, + .decrypt = rfc4106_decrypt, + .geniv = "seqiv", + .ivsize = 8, + .maxauthsize = 16, + }, + }, +}; + +static int __driver_rfc4106_encrypt(struct aead_request *req) +{ + u8 one_entry_in_sg = 0; + u8 *src, *dst, *assoc; + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 iv_tab[16+AESNI_ALIGN]; + u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN); + struct scatter_walk src_sg_walk; + struct scatter_walk assoc_sg_walk; + struct scatter_walk dst_sg_walk; + unsigned int i; + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length equal */ + /* to 8 or 12 bytes */ + if (unlikely(req->assoclen != 8 && req->assoclen != 12)) + return -EINVAL; + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + one_entry_in_sg = 1; + scatterwalk_start(&src_sg_walk, req->src); + scatterwalk_start(&assoc_sg_walk, req->assoc); + src = scatterwalk_map(&src_sg_walk, 0); + assoc = scatterwalk_map(&assoc_sg_walk, 0); + dst = src; + if (unlikely(req->src != req->dst)) { + scatterwalk_start(&dst_sg_walk, req->dst); + dst = scatterwalk_map(&dst_sg_walk, 0); + } + + } else { + /* Allocate memory for src, dst, assoc */ + src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, + GFP_ATOMIC); + if (unlikely(!src)) + return -ENOMEM; + assoc = (src + req->cryptlen + auth_tag_len); + scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); + scatterwalk_map_and_copy(assoc, req->assoc, 0, + req->assoclen, 0); + dst = src; + } + + aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, + ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst + + ((unsigned long)req->cryptlen), auth_tag_len); + + /* The authTag (aka the Integrity Check Value) needs to be written + * back to the packet. */ + if (one_entry_in_sg) { + if (unlikely(req->src != req->dst)) { + scatterwalk_unmap(dst, 0); + scatterwalk_done(&dst_sg_walk, 0, 0); + } + scatterwalk_unmap(src, 0); + scatterwalk_unmap(assoc, 0); + scatterwalk_done(&src_sg_walk, 0, 0); + scatterwalk_done(&assoc_sg_walk, 0, 0); + } else { + scatterwalk_map_and_copy(dst, req->dst, 0, + req->cryptlen + auth_tag_len, 1); + kfree(src); + } + return 0; +} + +static int __driver_rfc4106_decrypt(struct aead_request *req) +{ + u8 one_entry_in_sg = 0; + u8 *src, *dst, *assoc; + unsigned long tempCipherLen = 0; + __be32 counter = cpu_to_be32(1); + int retval = 0; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + void *aes_ctx = &(ctx->aes_key_expanded); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 iv_and_authTag[32+AESNI_ALIGN]; + u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN); + u8 *authTag = iv + 16; + struct scatter_walk src_sg_walk; + struct scatter_walk assoc_sg_walk; + struct scatter_walk dst_sg_walk; + unsigned int i; + + if (unlikely((req->cryptlen < auth_tag_len) || + (req->assoclen != 8 && req->assoclen != 12))) + return -EINVAL; + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length */ + /* equal to 8 or 12 bytes */ + + tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + one_entry_in_sg = 1; + scatterwalk_start(&src_sg_walk, req->src); + scatterwalk_start(&assoc_sg_walk, req->assoc); + src = scatterwalk_map(&src_sg_walk, 0); + assoc = scatterwalk_map(&assoc_sg_walk, 0); + dst = src; + if (unlikely(req->src != req->dst)) { + scatterwalk_start(&dst_sg_walk, req->dst); + dst = scatterwalk_map(&dst_sg_walk, 0); + } + + } else { + /* Allocate memory for src, dst, assoc */ + src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); + if (!src) + return -ENOMEM; + assoc = (src + req->cryptlen + auth_tag_len); + scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); + scatterwalk_map_and_copy(assoc, req->assoc, 0, + req->assoclen, 0); + dst = src; + } + + aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, + ctx->hash_subkey, assoc, (unsigned long)req->assoclen, + authTag, auth_tag_len); + + /* Compare generated tag with passed in tag. */ + retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? + -EBADMSG : 0; + + if (one_entry_in_sg) { + if (unlikely(req->src != req->dst)) { + scatterwalk_unmap(dst, 0); + scatterwalk_done(&dst_sg_walk, 0, 0); + } + scatterwalk_unmap(src, 0); + scatterwalk_unmap(assoc, 0); + scatterwalk_done(&src_sg_walk, 0, 0); + scatterwalk_done(&assoc_sg_walk, 0, 0); + } else { + scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1); + kfree(src); + } + return retval; +} + +static struct crypto_alg __rfc4106_alg = { + .cra_name = "__gcm-aes-aesni", + .cra_driver_name = "__driver-gcm-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_AEAD, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, + .cra_alignmask = 0, + .cra_type = &crypto_aead_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list), + .cra_u = { + .aead = { + .encrypt = __driver_rfc4106_encrypt, + .decrypt = __driver_rfc4106_decrypt, + }, + }, +}; + static int __init aesni_init(void) { int err; @@ -738,6 +1240,7 @@ static int __init aesni_init(void) printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); return -ENODEV; } + if ((err = crypto_register_alg(&aesni_alg))) goto aes_err; if ((err = crypto_register_alg(&__aesni_alg))) @@ -770,10 +1273,19 @@ static int __init aesni_init(void) if ((err = crypto_register_alg(&ablk_xts_alg))) goto ablk_xts_err; #endif - + err = crypto_register_alg(&__rfc4106_alg); + if (err) + goto __aead_gcm_err; + err = crypto_register_alg(&rfc4106_alg); + if (err) + goto aead_gcm_err; return err; +aead_gcm_err: + crypto_unregister_alg(&__rfc4106_alg); +__aead_gcm_err: #ifdef HAS_XTS + crypto_unregister_alg(&ablk_xts_alg); ablk_xts_err: #endif #ifdef HAS_PCBC @@ -809,6 +1321,8 @@ aes_err: static void __exit aesni_exit(void) { + crypto_unregister_alg(&__rfc4106_alg); + crypto_unregister_alg(&rfc4106_alg); #ifdef HAS_XTS crypto_unregister_alg(&ablk_xts_alg); #endif -- cgit v1.2.3 From 0d258efb6a58fe047197c3b9cff8746bb176d58a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sat, 27 Nov 2010 16:34:46 +0800 Subject: crypto: aesni-intel - Ported implementation to x86-32 The AES-NI instructions are also available in legacy mode so the 32-bit architecture may profit from those, too. To illustrate the performance gain here's a short summary of a dm-crypt speed test on a Core i7 M620 running at 2.67GHz comparing both assembler implementations: x86: i568 aes-ni delta ECB, 256 bit: 93.8 MB/s 123.3 MB/s +31.4% CBC, 256 bit: 84.8 MB/s 262.3 MB/s +209.3% LRW, 256 bit: 108.6 MB/s 222.1 MB/s +104.5% XTS, 256 bit: 105.0 MB/s 205.5 MB/s +95.7% Additionally, due to some minor optimizations, the 64-bit version also got a minor performance gain as seen below: x86-64: old impl. new impl. delta ECB, 256 bit: 121.1 MB/s 123.0 MB/s +1.5% CBC, 256 bit: 285.3 MB/s 290.8 MB/s +1.9% LRW, 256 bit: 263.7 MB/s 265.3 MB/s +0.6% XTS, 256 bit: 251.1 MB/s 255.3 MB/s +1.7% Signed-off-by: Mathias Krause Reviewed-by: Huang Ying Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 197 +++++++++++++++++++++++++++++++------ arch/x86/crypto/aesni-intel_glue.c | 22 ++++- crypto/Kconfig | 12 ++- 3 files changed, 191 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index aafced54df6..f592e03dc37 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -20,6 +20,9 @@ * Wajdi Feghali (wajdi.k.feghali@intel.com) * Copyright (c) 2010, Intel Corporation. * + * Ported x86_64 version to x86: + * Author: Mathias Krause + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -95,12 +98,16 @@ enc: .octa 0x2 #define IN IN1 #define KEY %xmm2 #define IV %xmm3 + #define BSWAP_MASK %xmm10 #define CTR %xmm11 #define INC %xmm12 +#ifdef __x86_64__ +#define AREG %rax #define KEYP %rdi #define OUTP %rsi +#define UKEYP OUTP #define INP %rdx #define LEN %rcx #define IVP %r8 @@ -109,6 +116,18 @@ enc: .octa 0x2 #define TKEYP T1 #define T2 %r11 #define TCTR_LOW T2 +#else +#define AREG %eax +#define KEYP %edi +#define OUTP AREG +#define UKEYP OUTP +#define INP %edx +#define LEN %esi +#define IVP %ebp +#define KLEN %ebx +#define T1 %ecx +#define TKEYP T1 +#endif /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) @@ -1247,10 +1266,11 @@ _key_expansion_256a: shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 - movaps %xmm0, (%rcx) - add $0x10, %rcx + movaps %xmm0, (TKEYP) + add $0x10, TKEYP ret +.align 4 _key_expansion_192a: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 @@ -1268,12 +1288,13 @@ _key_expansion_192a: movaps %xmm0, %xmm1 shufps $0b01000100, %xmm0, %xmm6 - movaps %xmm6, (%rcx) + movaps %xmm6, (TKEYP) shufps $0b01001110, %xmm2, %xmm1 - movaps %xmm1, 16(%rcx) - add $0x20, %rcx + movaps %xmm1, 0x10(TKEYP) + add $0x20, TKEYP ret +.align 4 _key_expansion_192b: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 @@ -1288,10 +1309,11 @@ _key_expansion_192b: pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 - movaps %xmm0, (%rcx) - add $0x10, %rcx + movaps %xmm0, (TKEYP) + add $0x10, TKEYP ret +.align 4 _key_expansion_256b: pshufd $0b10101010, %xmm1, %xmm1 shufps $0b00010000, %xmm2, %xmm4 @@ -1299,8 +1321,8 @@ _key_expansion_256b: shufps $0b10001100, %xmm2, %xmm4 pxor %xmm4, %xmm2 pxor %xmm1, %xmm2 - movaps %xmm2, (%rcx) - add $0x10, %rcx + movaps %xmm2, (TKEYP) + add $0x10, TKEYP ret /* @@ -1308,17 +1330,23 @@ _key_expansion_256b: * unsigned int key_len) */ ENTRY(aesni_set_key) - movups (%rsi), %xmm0 # user key (first 16 bytes) - movaps %xmm0, (%rdi) - lea 0x10(%rdi), %rcx # key addr - movl %edx, 480(%rdi) +#ifndef __x86_64__ + pushl KEYP + movl 8(%esp), KEYP # ctx + movl 12(%esp), UKEYP # in_key + movl 16(%esp), %edx # key_len +#endif + movups (UKEYP), %xmm0 # user key (first 16 bytes) + movaps %xmm0, (KEYP) + lea 0x10(KEYP), TKEYP # key addr + movl %edx, 480(KEYP) pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x cmp $24, %dl jb .Lenc_key128 je .Lenc_key192 - movups 0x10(%rsi), %xmm2 # other user key - movaps %xmm2, (%rcx) - add $0x10, %rcx + movups 0x10(UKEYP), %xmm2 # other user key + movaps %xmm2, (TKEYP) + add $0x10, TKEYP AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_256a AESKEYGENASSIST 0x1 %xmm0 %xmm1 @@ -1347,7 +1375,7 @@ ENTRY(aesni_set_key) call _key_expansion_256a jmp .Ldec_key .Lenc_key192: - movq 0x10(%rsi), %xmm2 # other user key + movq 0x10(UKEYP), %xmm2 # other user key AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_192a AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 @@ -1387,33 +1415,47 @@ ENTRY(aesni_set_key) AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 call _key_expansion_128 .Ldec_key: - sub $0x10, %rcx - movaps (%rdi), %xmm0 - movaps (%rcx), %xmm1 - movaps %xmm0, 240(%rcx) - movaps %xmm1, 240(%rdi) - add $0x10, %rdi - lea 240-16(%rcx), %rsi + sub $0x10, TKEYP + movaps (KEYP), %xmm0 + movaps (TKEYP), %xmm1 + movaps %xmm0, 240(TKEYP) + movaps %xmm1, 240(KEYP) + add $0x10, KEYP + lea 240-16(TKEYP), UKEYP .align 4 .Ldec_key_loop: - movaps (%rdi), %xmm0 + movaps (KEYP), %xmm0 AESIMC %xmm0 %xmm1 - movaps %xmm1, (%rsi) - add $0x10, %rdi - sub $0x10, %rsi - cmp %rcx, %rdi + movaps %xmm1, (UKEYP) + add $0x10, KEYP + sub $0x10, UKEYP + cmp TKEYP, KEYP jb .Ldec_key_loop - xor %rax, %rax + xor AREG, AREG +#ifndef __x86_64__ + popl KEYP +#endif ret /* * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_enc) +#ifndef __x86_64__ + pushl KEYP + pushl KLEN + movl 12(%esp), KEYP + movl 16(%esp), OUTP + movl 20(%esp), INP +#endif movl 480(KEYP), KLEN # key length movups (INP), STATE # input call _aesni_enc1 movups STATE, (OUTP) # output +#ifndef __x86_64__ + popl KLEN + popl KEYP +#endif ret /* @@ -1428,6 +1470,7 @@ ENTRY(aesni_enc) * KEY * TKEYP (T1) */ +.align 4 _aesni_enc1: movaps (KEYP), KEY # key mov KEYP, TKEYP @@ -1490,6 +1533,7 @@ _aesni_enc1: * KEY * TKEYP (T1) */ +.align 4 _aesni_enc4: movaps (KEYP), KEY # key mov KEYP, TKEYP @@ -1583,11 +1627,22 @@ _aesni_enc4: * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_dec) +#ifndef __x86_64__ + pushl KEYP + pushl KLEN + movl 12(%esp), KEYP + movl 16(%esp), OUTP + movl 20(%esp), INP +#endif mov 480(KEYP), KLEN # key length add $240, KEYP movups (INP), STATE # input call _aesni_dec1 movups STATE, (OUTP) #output +#ifndef __x86_64__ + popl KLEN + popl KEYP +#endif ret /* @@ -1602,6 +1657,7 @@ ENTRY(aesni_dec) * KEY * TKEYP (T1) */ +.align 4 _aesni_dec1: movaps (KEYP), KEY # key mov KEYP, TKEYP @@ -1664,6 +1720,7 @@ _aesni_dec1: * KEY * TKEYP (T1) */ +.align 4 _aesni_dec4: movaps (KEYP), KEY # key mov KEYP, TKEYP @@ -1758,6 +1815,15 @@ _aesni_dec4: * size_t len) */ ENTRY(aesni_ecb_enc) +#ifndef __x86_64__ + pushl LEN + pushl KEYP + pushl KLEN + movl 16(%esp), KEYP + movl 20(%esp), OUTP + movl 24(%esp), INP + movl 28(%esp), LEN +#endif test LEN, LEN # check length jz .Lecb_enc_ret mov 480(KEYP), KLEN @@ -1794,6 +1860,11 @@ ENTRY(aesni_ecb_enc) cmp $16, LEN jge .Lecb_enc_loop1 .Lecb_enc_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN +#endif ret /* @@ -1801,6 +1872,15 @@ ENTRY(aesni_ecb_enc) * size_t len); */ ENTRY(aesni_ecb_dec) +#ifndef __x86_64__ + pushl LEN + pushl KEYP + pushl KLEN + movl 16(%esp), KEYP + movl 20(%esp), OUTP + movl 24(%esp), INP + movl 28(%esp), LEN +#endif test LEN, LEN jz .Lecb_dec_ret mov 480(KEYP), KLEN @@ -1838,6 +1918,11 @@ ENTRY(aesni_ecb_dec) cmp $16, LEN jge .Lecb_dec_loop1 .Lecb_dec_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN +#endif ret /* @@ -1845,6 +1930,17 @@ ENTRY(aesni_ecb_dec) * size_t len, u8 *iv) */ ENTRY(aesni_cbc_enc) +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl 20(%esp), KEYP + movl 24(%esp), OUTP + movl 28(%esp), INP + movl 32(%esp), LEN + movl 36(%esp), IVP +#endif cmp $16, LEN jb .Lcbc_enc_ret mov 480(KEYP), KLEN @@ -1862,6 +1958,12 @@ ENTRY(aesni_cbc_enc) jge .Lcbc_enc_loop movups STATE, (IVP) .Lcbc_enc_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif ret /* @@ -1869,6 +1971,17 @@ ENTRY(aesni_cbc_enc) * size_t len, u8 *iv) */ ENTRY(aesni_cbc_dec) +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl 20(%esp), KEYP + movl 24(%esp), OUTP + movl 28(%esp), INP + movl 32(%esp), LEN + movl 36(%esp), IVP +#endif cmp $16, LEN jb .Lcbc_dec_just_ret mov 480(KEYP), KLEN @@ -1882,16 +1995,30 @@ ENTRY(aesni_cbc_dec) movaps IN1, STATE1 movups 0x10(INP), IN2 movaps IN2, STATE2 +#ifdef __x86_64__ movups 0x20(INP), IN3 movaps IN3, STATE3 movups 0x30(INP), IN4 movaps IN4, STATE4 +#else + movups 0x20(INP), IN1 + movaps IN1, STATE3 + movups 0x30(INP), IN2 + movaps IN2, STATE4 +#endif call _aesni_dec4 pxor IV, STATE1 +#ifdef __x86_64__ pxor IN1, STATE2 pxor IN2, STATE3 pxor IN3, STATE4 movaps IN4, IV +#else + pxor (INP), STATE2 + pxor 0x10(INP), STATE3 + pxor IN1, STATE4 + movaps IN2, IV +#endif movups STATE1, (OUTP) movups STATE2, 0x10(OUTP) movups STATE3, 0x20(OUTP) @@ -1919,8 +2046,15 @@ ENTRY(aesni_cbc_dec) .Lcbc_dec_ret: movups IV, (IVP) .Lcbc_dec_just_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif ret +#ifdef __x86_64__ .align 16 .Lbswap_mask: .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 @@ -1936,6 +2070,7 @@ ENTRY(aesni_cbc_dec) * INC: == 1, in little endian * BSWAP_MASK == endian swapping mask */ +.align 4 _aesni_inc_init: movaps .Lbswap_mask, BSWAP_MASK movaps IV, CTR @@ -1960,6 +2095,7 @@ _aesni_inc_init: * CTR: == output IV, in little endian * TCTR_LOW: == lower qword of CTR */ +.align 4 _aesni_inc: paddq INC, CTR add $1, TCTR_LOW @@ -2031,3 +2167,4 @@ ENTRY(aesni_ctr_enc) movups IV, (IVP) .Lctr_enc_just_ret: ret +#endif diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 02d349d6442..8a3b8007521 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -94,8 +94,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +#ifdef CONFIG_X86_64 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +#endif /* asmlinkage void aesni_gcm_enc() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. @@ -410,6 +412,7 @@ static struct crypto_alg blk_cbc_alg = { }, }; +#ifdef CONFIG_X86_64 static void ctr_crypt_final(struct crypto_aes_ctx *ctx, struct blkcipher_walk *walk) { @@ -475,6 +478,7 @@ static struct crypto_alg blk_ctr_alg = { }, }, }; +#endif static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) @@ -622,6 +626,7 @@ static struct crypto_alg ablk_cbc_alg = { }, }; +#ifdef CONFIG_X86_64 static int ablk_ctr_init(struct crypto_tfm *tfm) { struct cryptd_ablkcipher *cryptd_tfm; @@ -698,6 +703,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = { }, }; #endif +#endif #ifdef HAS_LRW static int ablk_lrw_init(struct crypto_tfm *tfm) @@ -1249,18 +1255,20 @@ static int __init aesni_init(void) goto blk_ecb_err; if ((err = crypto_register_alg(&blk_cbc_alg))) goto blk_cbc_err; - if ((err = crypto_register_alg(&blk_ctr_alg))) - goto blk_ctr_err; if ((err = crypto_register_alg(&ablk_ecb_alg))) goto ablk_ecb_err; if ((err = crypto_register_alg(&ablk_cbc_alg))) goto ablk_cbc_err; +#ifdef CONFIG_X86_64 + if ((err = crypto_register_alg(&blk_ctr_alg))) + goto blk_ctr_err; if ((err = crypto_register_alg(&ablk_ctr_alg))) goto ablk_ctr_err; #ifdef HAS_CTR if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) goto ablk_rfc3686_ctr_err; #endif +#endif #ifdef HAS_LRW if ((err = crypto_register_alg(&ablk_lrw_alg))) goto ablk_lrw_err; @@ -1296,18 +1304,20 @@ ablk_pcbc_err: crypto_unregister_alg(&ablk_lrw_alg); ablk_lrw_err: #endif +#ifdef CONFIG_X86_64 #ifdef HAS_CTR crypto_unregister_alg(&ablk_rfc3686_ctr_alg); ablk_rfc3686_ctr_err: #endif crypto_unregister_alg(&ablk_ctr_alg); ablk_ctr_err: + crypto_unregister_alg(&blk_ctr_alg); +blk_ctr_err: +#endif crypto_unregister_alg(&ablk_cbc_alg); ablk_cbc_err: crypto_unregister_alg(&ablk_ecb_alg); ablk_ecb_err: - crypto_unregister_alg(&blk_ctr_alg); -blk_ctr_err: crypto_unregister_alg(&blk_cbc_alg); blk_cbc_err: crypto_unregister_alg(&blk_ecb_alg); @@ -1332,13 +1342,15 @@ static void __exit aesni_exit(void) #ifdef HAS_LRW crypto_unregister_alg(&ablk_lrw_alg); #endif +#ifdef CONFIG_X86_64 #ifdef HAS_CTR crypto_unregister_alg(&ablk_rfc3686_ctr_alg); #endif crypto_unregister_alg(&ablk_ctr_alg); + crypto_unregister_alg(&blk_ctr_alg); +#endif crypto_unregister_alg(&ablk_cbc_alg); crypto_unregister_alg(&ablk_ecb_alg); - crypto_unregister_alg(&blk_ctr_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); crypto_unregister_alg(&__aesni_alg); diff --git a/crypto/Kconfig b/crypto/Kconfig index 69437e21217..467491df3e3 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -539,8 +539,9 @@ config CRYPTO_AES_X86_64 config CRYPTO_AES_NI_INTEL tristate "AES cipher algorithms (AES-NI)" - depends on (X86 || UML_X86) && 64BIT - select CRYPTO_AES_X86_64 + depends on (X86 || UML_X86) + select CRYPTO_AES_X86_64 if 64BIT + select CRYPTO_AES_586 if !64BIT select CRYPTO_CRYPTD select CRYPTO_ALGAPI select CRYPTO_FPU @@ -563,9 +564,10 @@ config CRYPTO_AES_NI_INTEL See for more information. - In addition to AES cipher algorithm support, the - acceleration for some popular block cipher mode is supported - too, including ECB, CBC, CTR, LRW, PCBC, XTS. + In addition to AES cipher algorithm support, the acceleration + for some popular block cipher mode is supported too, including + ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional + acceleration for CTR. config CRYPTO_ANUBIS tristate "Anubis cipher algorithm" -- cgit v1.2.3 From 559ad0ff1368baea14dbc3207d55b02bd69bda4b Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 29 Nov 2010 08:35:39 +0800 Subject: crypto: aesni-intel - Fixed build error on x86-32 Exclude AES-GCM code for x86-32 due to heavy usage of 64-bit registers not available on x86-32. While at it, fixed unregister order in aesni_exit(). Signed-off-by: Mathias Krause Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 5 ++++- arch/x86/crypto/aesni-intel_glue.c | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index f592e03dc37..d528fde219d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,6 +32,7 @@ #include #include +#ifdef __x86_64__ .data POLY: .octa 0xC2000000000000000000000000000001 TWOONE: .octa 0x00000001000000000000000000000001 @@ -84,6 +85,7 @@ enc: .octa 0x2 #define arg8 STACK_OFFSET+16(%r14) #define arg9 STACK_OFFSET+24(%r14) #define arg10 STACK_OFFSET+32(%r14) +#endif #define STATE1 %xmm0 @@ -130,6 +132,7 @@ enc: .octa 0x2 #endif +#ifdef __x86_64__ /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) * * @@ -1255,7 +1258,7 @@ _return_T_done_encrypt: pop %r13 pop %r12 ret - +#endif _key_expansion_128: diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 8a3b8007521..e1e60c7d581 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -97,7 +97,6 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, #ifdef CONFIG_X86_64 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); -#endif /* asmlinkage void aesni_gcm_enc() * void *ctx, AES Key schedule. Starts on a 16 byte boundary. @@ -149,6 +148,7 @@ aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) PTR_ALIGN((u8 *) crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN); } +#endif static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) { @@ -822,6 +822,7 @@ static struct crypto_alg ablk_xts_alg = { }; #endif +#ifdef CONFIG_X86_64 static int rfc4106_init(struct crypto_tfm *tfm) { struct cryptd_aead *cryptd_tfm; @@ -1237,6 +1238,7 @@ static struct crypto_alg __rfc4106_alg = { }, }, }; +#endif static int __init aesni_init(void) { @@ -1264,6 +1266,10 @@ static int __init aesni_init(void) goto blk_ctr_err; if ((err = crypto_register_alg(&ablk_ctr_alg))) goto ablk_ctr_err; + if ((err = crypto_register_alg(&__rfc4106_alg))) + goto __aead_gcm_err; + if ((err = crypto_register_alg(&rfc4106_alg))) + goto aead_gcm_err; #ifdef HAS_CTR if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) goto ablk_rfc3686_ctr_err; @@ -1281,19 +1287,9 @@ static int __init aesni_init(void) if ((err = crypto_register_alg(&ablk_xts_alg))) goto ablk_xts_err; #endif - err = crypto_register_alg(&__rfc4106_alg); - if (err) - goto __aead_gcm_err; - err = crypto_register_alg(&rfc4106_alg); - if (err) - goto aead_gcm_err; return err; -aead_gcm_err: - crypto_unregister_alg(&__rfc4106_alg); -__aead_gcm_err: #ifdef HAS_XTS - crypto_unregister_alg(&ablk_xts_alg); ablk_xts_err: #endif #ifdef HAS_PCBC @@ -1309,6 +1305,10 @@ ablk_lrw_err: crypto_unregister_alg(&ablk_rfc3686_ctr_alg); ablk_rfc3686_ctr_err: #endif + crypto_unregister_alg(&rfc4106_alg); +aead_gcm_err: + crypto_unregister_alg(&__rfc4106_alg); +__aead_gcm_err: crypto_unregister_alg(&ablk_ctr_alg); ablk_ctr_err: crypto_unregister_alg(&blk_ctr_alg); @@ -1331,8 +1331,6 @@ aes_err: static void __exit aesni_exit(void) { - crypto_unregister_alg(&__rfc4106_alg); - crypto_unregister_alg(&rfc4106_alg); #ifdef HAS_XTS crypto_unregister_alg(&ablk_xts_alg); #endif @@ -1346,6 +1344,8 @@ static void __exit aesni_exit(void) #ifdef HAS_CTR crypto_unregister_alg(&ablk_rfc3686_ctr_alg); #endif + crypto_unregister_alg(&rfc4106_alg); + crypto_unregister_alg(&__rfc4106_alg); crypto_unregister_alg(&ablk_ctr_alg); crypto_unregister_alg(&blk_ctr_alg); #endif -- cgit v1.2.3 From 3c097b800816c0e4c2a34c38f8b2409427770f7a Mon Sep 17 00:00:00 2001 From: Tadeusz Struk Date: Mon, 13 Dec 2010 19:51:15 +0800 Subject: crypto: aesni-intel - Fixed build with binutils 2.16 This patch fixes the problem with 2.16 binutils. Signed-off-by: Aidan O'Mahony Signed-off-by: Adrian Hoban Signed-off-by: Gabriele Paoloni Signed-off-by: Tadeusz Struk Signed-off-by: Herbert Xu --- arch/x86/crypto/aesni-intel_asm.S | 598 +++++++++++++++++++++++++++++++++----- 1 file changed, 519 insertions(+), 79 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index d528fde219d..8fe2a4966b7 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -204,9 +204,9 @@ enc: .octa 0x2 * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified */ -.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ -XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation +.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ +XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation mov arg7, %r10 # %r10 = AAD mov arg8, %r12 # %r12 = aadLen mov %r12, %r11 @@ -228,19 +228,25 @@ _get_AAD_loop2\num_initial_blocks\operation: cmp %r11, %r12 jne _get_AAD_loop2\num_initial_blocks\operation _get_AAD_loop2_done\num_initial_blocks\operation: - pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + xor %r11, %r11 # initialise the data pointer offset as zero # start AES for num_initial_blocks blocks mov %arg5, %rax # %rax = *Y0 movdqu (%rax), \XMM0 # XMM0 = Y0 - pshufb SHUF_MASK(%rip), \XMM0 -.if \i_seq != 0 + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM0 + +.if (\i == 5) || (\i == 6) || (\i == 7) .irpc index, \i_seq paddd ONE(%rip), \XMM0 # INCR Y0 movdqa \XMM0, %xmm\index - pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + .endr .irpc index, \i_seq pxor 16*0(%arg1), %xmm\index @@ -291,10 +297,11 @@ _get_AAD_loop2_done\num_initial_blocks\operation: movdqu %xmm\index, (%arg2 , %r11, 1) # write back plaintext/ciphertext for num_initial_blocks add $16, %r11 -.if \operation == dec + movdqa \TMP1, %xmm\index -.endif - pshufb SHUF_MASK(%rip), %xmm\index + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, %xmm\index + # prepare plaintext/ciphertext for GHASH computation .endr .endif @@ -327,16 +334,24 @@ _get_AAD_loop2_done\num_initial_blocks\operation: */ paddd ONE(%rip), \XMM0 # INCR Y0 movdqa \XMM0, \XMM1 - pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + paddd ONE(%rip), \XMM0 # INCR Y0 movdqa \XMM0, \XMM2 - pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + paddd ONE(%rip), \XMM0 # INCR Y0 movdqa \XMM0, \XMM3 - pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + paddd ONE(%rip), \XMM0 # INCR Y0 movdqa \XMM0, \XMM4 - pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + pxor 16*0(%arg1), \XMM1 pxor 16*0(%arg1), \XMM2 pxor 16*0(%arg1), \XMM3 @@ -385,41 +400,268 @@ _get_AAD_loop2_done\num_initial_blocks\operation: AESENCLAST \TMP2, \XMM4 movdqu 16*0(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM1 -.if \operation == dec movdqu \XMM1, 16*0(%arg2 , %r11 , 1) movdqa \TMP1, \XMM1 -.endif movdqu 16*1(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM2 -.if \operation == dec movdqu \XMM2, 16*1(%arg2 , %r11 , 1) movdqa \TMP1, \XMM2 -.endif movdqu 16*2(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM3 -.if \operation == dec movdqu \XMM3, 16*2(%arg2 , %r11 , 1) movdqa \TMP1, \XMM3 -.endif movdqu 16*3(%arg3 , %r11 , 1), \TMP1 pxor \TMP1, \XMM4 -.if \operation == dec movdqu \XMM4, 16*3(%arg2 , %r11 , 1) movdqa \TMP1, \XMM4 -.else + add $64, %r11 + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pxor \XMMDst, \XMM1 +# combine GHASHed value with the corresponding ciphertext + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + +_initial_blocks_done\num_initial_blocks\operation: + +.endm + + +/* +* if a = number of total plaintext bytes +* b = floor(a/16) +* num_initial_blocks = b mod 4 +* encrypt the initial num_initial_blocks blocks and apply ghash on +* the ciphertext +* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers +* are clobbered +* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified +*/ + + +.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ +XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + mov arg7, %r10 # %r10 = AAD + mov arg8, %r12 # %r12 = aadLen + mov %r12, %r11 + pxor %xmm\i, %xmm\i +_get_AAD_loop\num_initial_blocks\operation: + movd (%r10), \TMP1 + pslldq $12, \TMP1 + psrldq $4, %xmm\i + pxor \TMP1, %xmm\i + add $4, %r10 + sub $4, %r12 + jne _get_AAD_loop\num_initial_blocks\operation + cmp $16, %r11 + je _get_AAD_loop2_done\num_initial_blocks\operation + mov $16, %r12 +_get_AAD_loop2\num_initial_blocks\operation: + psrldq $4, %xmm\i + sub $4, %r12 + cmp %r11, %r12 + jne _get_AAD_loop2\num_initial_blocks\operation +_get_AAD_loop2_done\num_initial_blocks\operation: + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + + xor %r11, %r11 # initialise the data pointer offset as zero + + # start AES for num_initial_blocks blocks + + mov %arg5, %rax # %rax = *Y0 + movdqu (%rax), \XMM0 # XMM0 = Y0 + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM0 + +.if (\i == 5) || (\i == 6) || (\i == 7) +.irpc index, \i_seq + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, %xmm\index + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + +.endr +.irpc index, \i_seq + pxor 16*0(%arg1), %xmm\index +.endr +.irpc index, \i_seq + movaps 0x10(%rdi), \TMP1 + AESENC \TMP1, %xmm\index # Round 1 +.endr +.irpc index, \i_seq + movaps 0x20(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x30(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x40(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x50(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x60(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x70(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x80(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0x90(%arg1), \TMP1 + AESENC \TMP1, %xmm\index # Round 2 +.endr +.irpc index, \i_seq + movaps 0xa0(%arg1), \TMP1 + AESENCLAST \TMP1, %xmm\index # Round 10 +.endr +.irpc index, \i_seq + movdqu (%arg3 , %r11, 1), \TMP1 + pxor \TMP1, %xmm\index + movdqu %xmm\index, (%arg2 , %r11, 1) + # write back plaintext/ciphertext for num_initial_blocks + add $16, %r11 + + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, %xmm\index + + # prepare plaintext/ciphertext for GHASH computation +.endr +.endif + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks + +.if \i == 5 + pxor %xmm5, %xmm6 + GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm6, %xmm7 + GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.elseif \i == 6 + pxor %xmm6, %xmm7 + GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.elseif \i == 7 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.endif + cmp $64, %r13 + jl _initial_blocks_done\num_initial_blocks\operation + # no need for precomputed values +/* +* +* Precomputations for HashKey parallel with encryption of first 4 blocks. +* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +*/ + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, \XMM1 + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, \XMM2 + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, \XMM3 + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + + paddd ONE(%rip), \XMM0 # INCR Y0 + movdqa \XMM0, \XMM4 + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + + pxor 16*0(%arg1), \XMM1 + pxor 16*0(%arg1), \XMM2 + pxor 16*0(%arg1), \XMM3 + pxor 16*0(%arg1), \XMM4 + movdqa \TMP3, \TMP5 + pshufd $78, \TMP3, \TMP1 + pxor \TMP3, \TMP1 + movdqa \TMP1, HashKey_k(%rsp) + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^2<<1 (mod poly) + movdqa \TMP5, HashKey_2(%rsp) +# HashKey_2 = HashKey^2<<1 (mod poly) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_2_k(%rsp) +.irpc index, 1234 # do 4 rounds + movaps 0x10*\index(%arg1), \TMP1 + AESENC \TMP1, \XMM1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 +.endr + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_3(%rsp) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_3_k(%rsp) +.irpc index, 56789 # do next 5 rounds + movaps 0x10*\index(%arg1), \TMP1 + AESENC \TMP1, \XMM1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 +.endr + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_4(%rsp) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_4_k(%rsp) + movaps 0xa0(%arg1), \TMP2 + AESENCLAST \TMP2, \XMM1 + AESENCLAST \TMP2, \XMM2 + AESENCLAST \TMP2, \XMM3 + AESENCLAST \TMP2, \XMM4 + movdqu 16*0(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM1 + movdqu 16*1(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM2 + movdqu 16*2(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM3 + movdqu 16*3(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM4 movdqu \XMM1, 16*0(%arg2 , %r11 , 1) movdqu \XMM2, 16*1(%arg2 , %r11 , 1) movdqu \XMM3, 16*2(%arg2 , %r11 , 1) movdqu \XMM4, 16*3(%arg2 , %r11 , 1) -.endif + add $64, %r11 - pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap pxor \XMMDst, \XMM1 # combine GHASHed value with the corresponding ciphertext - pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap - pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap - pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + movdqa SHUF_MASK(%rip), %xmm14 + PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + _initial_blocks_done\num_initial_blocks\operation: + .endm /* @@ -428,7 +670,199 @@ _initial_blocks_done\num_initial_blocks\operation: * arg1, %arg2, %arg3 are used as pointers only, not modified * %r11 is the data offset value */ -.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \ +.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ +TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation + + movdqa \XMM1, \XMM5 + movdqa \XMM2, \XMM6 + movdqa \XMM3, \XMM7 + movdqa \XMM4, \XMM8 + + movdqa SHUF_MASK(%rip), %xmm15 + # multiply TMP5 * HashKey using karatsuba + + movdqa \XMM5, \TMP4 + pshufd $78, \XMM5, \TMP6 + pxor \XMM5, \TMP6 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa HashKey_4(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + movdqa \XMM0, \XMM1 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM2 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM3 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM4 + PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap + PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + + pxor (%arg1), \XMM1 + pxor (%arg1), \XMM2 + pxor (%arg1), \XMM3 + pxor (%arg1), \XMM4 + movdqa HashKey_4_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + movaps 0x10(%arg1), \TMP1 + AESENC \TMP1, \XMM1 # Round 1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 + movaps 0x20(%arg1), \TMP1 + AESENC \TMP1, \XMM1 # Round 2 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 + movdqa \XMM6, \TMP1 + pshufd $78, \XMM6, \TMP2 + pxor \XMM6, \TMP2 + movdqa HashKey_3(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + movaps 0x30(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 3 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + movaps 0x40(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 4 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + movdqa HashKey_3_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movaps 0x50(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 5 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + pxor \TMP1, \TMP4 +# accumulate the results in TMP4:XMM5, TMP6 holds the middle part + pxor \XMM6, \XMM5 + pxor \TMP2, \TMP6 + movdqa \XMM7, \TMP1 + pshufd $78, \XMM7, \TMP2 + pxor \XMM7, \TMP2 + movdqa HashKey_2(%rsp ), \TMP5 + + # Multiply TMP5 * HashKey using karatsuba + + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + movaps 0x60(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 6 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + movaps 0x70(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 7 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + movdqa HashKey_2_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movaps 0x80(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 8 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + pxor \TMP1, \TMP4 +# accumulate the results in TMP4:XMM5, TMP6 holds the middle part + pxor \XMM7, \XMM5 + pxor \TMP2, \TMP6 + + # Multiply XMM8 * HashKey + # XMM8 and TMP5 hold the values for the two operands + + movdqa \XMM8, \TMP1 + pshufd $78, \XMM8, \TMP2 + pxor \XMM8, \TMP2 + movdqa HashKey(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + movaps 0x90(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 9 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + movaps 0xa0(%arg1), \TMP3 + AESENCLAST \TMP3, \XMM1 # Round 10 + AESENCLAST \TMP3, \XMM2 + AESENCLAST \TMP3, \XMM3 + AESENCLAST \TMP3, \XMM4 + movdqa HashKey_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movdqu (%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK + movdqu 16(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK + movdqu 32(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK + movdqu 48(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK + movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer + PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + + pxor \TMP4, \TMP1 + pxor \XMM8, \XMM5 + pxor \TMP6, \TMP2 + pxor \TMP1, \TMP2 + pxor \XMM5, \TMP2 + movdqa \TMP2, \TMP3 + pslldq $8, \TMP3 # left shift TMP3 2 DWs + psrldq $8, \TMP2 # right shift TMP2 2 DWs + pxor \TMP3, \XMM5 + pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 + + # first phase of reduction + + movdqa \XMM5, \TMP2 + movdqa \XMM5, \TMP3 + movdqa \XMM5, \TMP4 +# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently + pslld $31, \TMP2 # packed right shift << 31 + pslld $30, \TMP3 # packed right shift << 30 + pslld $25, \TMP4 # packed right shift << 25 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + movdqa \TMP2, \TMP5 + psrldq $4, \TMP5 # right shift T5 1 DW + pslldq $12, \TMP2 # left shift T2 3 DWs + pxor \TMP2, \XMM5 + + # second phase of reduction + + movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 + movdqa \XMM5,\TMP3 + movdqa \XMM5,\TMP4 + psrld $1, \TMP2 # packed left shift >>1 + psrld $2, \TMP3 # packed left shift >>2 + psrld $7, \TMP4 # packed left shift >>7 + pxor \TMP3,\TMP2 # xor the shifted versions + pxor \TMP4,\TMP2 + pxor \TMP5, \TMP2 + pxor \TMP2, \XMM5 + pxor \TMP1, \XMM5 # result is in TMP1 + + pxor \XMM5, \XMM1 +.endm + +/* +* decrypt 4 blocks at a time +* ghash the 4 previously decrypted ciphertext blocks +* arg1, %arg2, %arg3 are used as pointers only, not modified +* %r11 is the data offset value +*/ +.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM1, \XMM5 @@ -436,6 +870,7 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM3, \XMM7 movdqa \XMM4, \XMM8 + movdqa SHUF_MASK(%rip), %xmm15 # multiply TMP5 * HashKey using karatsuba movdqa \XMM5, \TMP4 @@ -451,11 +886,12 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation movdqa \XMM0, \XMM3 paddd ONE(%rip), \XMM0 # INCR CNT movdqa \XMM0, \XMM4 - pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 - pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap - pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap - pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + pxor (%arg1), \XMM1 pxor (%arg1), \XMM2 pxor (%arg1), \XMM3 @@ -553,37 +989,24 @@ TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) movdqu (%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK -.if \operation == dec movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM1 -.endif movdqu 16(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK -.if \operation == dec movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM2 -.endif movdqu 32(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK -.if \operation == dec movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM3 -.endif movdqu 48(%arg3,%r11,1), \TMP3 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK -.if \operation == dec movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer movdqa \TMP3, \XMM4 -.else - movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer - movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer - movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer - movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer -.endif - pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap - pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap - pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap - pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte sway + PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap pxor \TMP4, \TMP1 pxor \XMM8, \XMM5 @@ -853,7 +1276,9 @@ ENTRY(aesni_gcm_dec) and $~63, %rsp # align rsp to 64 bytes mov %arg6, %r12 movdqu (%r12), %xmm13 # %xmm13 = HashKey - pshufb SHUF_MASK(%rip), %xmm13 + movdqa SHUF_MASK(%rip), %xmm2 + PSHUFB_XMM %xmm2, %xmm13 + # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) @@ -885,22 +1310,22 @@ ENTRY(aesni_gcm_dec) jb _initial_num_blocks_is_1_decrypt je _initial_num_blocks_is_2_decrypt _initial_num_blocks_is_3_decrypt: - INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ + INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec sub $48, %r13 jmp _initial_blocks_decrypted _initial_num_blocks_is_2_decrypt: - INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ + INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec sub $32, %r13 jmp _initial_blocks_decrypted _initial_num_blocks_is_1_decrypt: - INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ + INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec sub $16, %r13 jmp _initial_blocks_decrypted _initial_num_blocks_is_0_decrypt: - INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ + INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec _initial_blocks_decrypted: cmp $0, %r13 @@ -908,7 +1333,7 @@ _initial_blocks_decrypted: sub $64, %r13 je _four_cipher_left_decrypt _decrypt_by_4: - GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ + GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec add $64, %r11 sub $64, %r13 @@ -924,7 +1349,9 @@ _zero_cipher_left_decrypt: # Handle the last <16 byte block seperately paddd ONE(%rip), %xmm0 # increment CNT to get Yn - pshufb SHUF_MASK(%rip), %xmm0 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) sub $16, %r11 add %r13, %r11 @@ -934,14 +1361,17 @@ _zero_cipher_left_decrypt: # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes # (%r13 is the number of bytes in plaintext mod 16) movdqu (%r12), %xmm2 # get the appropriate shuffle mask - pshufb %xmm2, %xmm1 # right shift 16-%r13 butes + PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes + movdqa %xmm1, %xmm2 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 pand %xmm1, %xmm2 - pshufb SHUF_MASK(%rip),%xmm2 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10 ,%xmm2 + pxor %xmm2, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # GHASH computation for the last <16 byte block @@ -949,13 +1379,13 @@ _zero_cipher_left_decrypt: add $16, %r11 # output %r13 bytes - movq %xmm0, %rax + MOVQ_R64_XMM %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_decrypt mov %rax, (%arg2 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 - movq %xmm0, %rax + MOVQ_R64_XMM %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_decrypt: mov %al, (%arg2, %r11, 1) @@ -968,13 +1398,15 @@ _multiple_of_16_bytes_decrypt: shl $3, %r12 # convert into number of bits movd %r12d, %xmm15 # len(A) in %xmm15 shl $3, %arg4 # len(C) in bits (*128) - movq %arg4, %xmm1 + MOVQ_R64_XMM %arg4, %xmm1 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) pxor %xmm15, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation - pshufb SHUF_MASK(%rip), %xmm8 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm8 + mov %arg5, %rax # %rax = *Y0 movdqu (%rax), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) @@ -987,11 +1419,11 @@ _return_T_decrypt: cmp $12, %r11 je _T_12_decrypt _T_8_decrypt: - movq %xmm0, %rax + MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) jmp _return_T_done_decrypt _T_12_decrypt: - movq %xmm0, %rax + MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) psrldq $8, %xmm0 movd %xmm0, %eax @@ -1103,7 +1535,9 @@ ENTRY(aesni_gcm_enc) and $~63, %rsp mov %arg6, %r12 movdqu (%r12), %xmm13 - pshufb SHUF_MASK(%rip), %xmm13 + movdqa SHUF_MASK(%rip), %xmm2 + PSHUFB_XMM %xmm2, %xmm13 + # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) @@ -1134,22 +1568,22 @@ ENTRY(aesni_gcm_enc) jb _initial_num_blocks_is_1_encrypt je _initial_num_blocks_is_2_encrypt _initial_num_blocks_is_3_encrypt: - INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ + INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc sub $48, %r13 jmp _initial_blocks_encrypted _initial_num_blocks_is_2_encrypt: - INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ + INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc sub $32, %r13 jmp _initial_blocks_encrypted _initial_num_blocks_is_1_encrypt: - INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ + INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc sub $16, %r13 jmp _initial_blocks_encrypted _initial_num_blocks_is_0_encrypt: - INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ + INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc _initial_blocks_encrypted: @@ -1160,7 +1594,7 @@ _initial_blocks_encrypted: sub $64, %r13 je _four_cipher_left_encrypt _encrypt_by_4_encrypt: - GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ + GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc add $64, %r11 sub $64, %r13 @@ -1175,7 +1609,9 @@ _zero_cipher_left_encrypt: # Handle the last <16 Byte block seperately paddd ONE(%rip), %xmm0 # INCR CNT to get Yn - pshufb SHUF_MASK(%rip), %xmm0 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) sub $16, %r11 add %r13, %r11 @@ -1185,29 +1621,31 @@ _zero_cipher_left_encrypt: # adjust the shuffle mask pointer to be able to shift 16-r13 bytes # (%r13 is the number of bytes in plaintext mod 16) movdqu (%r12), %xmm2 # get the appropriate shuffle mask - pshufb %xmm2, %xmm1 # shift right 16-r13 byte + PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm0 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10,%xmm0 - pshufb SHUF_MASK(%rip),%xmm0 pxor %xmm0, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # GHASH computation for the last <16 byte block sub %r13, %r11 add $16, %r11 - pshufb SHUF_MASK(%rip), %xmm0 + PSHUFB_XMM %xmm10, %xmm1 + # shuffle xmm0 back to output as ciphertext # Output %r13 bytes - movq %xmm0, %rax + MOVQ_R64_XMM %xmm0, %rax cmp $8, %r13 jle _less_than_8_bytes_left_encrypt mov %rax, (%arg2 , %r11, 1) add $8, %r11 psrldq $8, %xmm0 - movq %xmm0, %rax + MOVQ_R64_XMM %xmm0, %rax sub $8, %r13 _less_than_8_bytes_left_encrypt: mov %al, (%arg2, %r11, 1) @@ -1220,14 +1658,15 @@ _multiple_of_16_bytes_encrypt: shl $3, %r12 movd %r12d, %xmm15 # len(A) in %xmm15 shl $3, %arg4 # len(C) in bits (*128) - movq %arg4, %xmm1 + MOVQ_R64_XMM %arg4, %xmm1 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) pxor %xmm15, %xmm8 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap - pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap mov %arg5, %rax # %rax = *Y0 movdqu (%rax), %xmm0 # %xmm0 = Y0 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) @@ -1240,11 +1679,11 @@ _return_T_encrypt: cmp $12, %r11 je _T_12_encrypt _T_8_encrypt: - movq %xmm0, %rax + MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) jmp _return_T_done_encrypt _T_12_encrypt: - movq %xmm0, %rax + MOVQ_R64_XMM %xmm0, %rax mov %rax, (%r10) psrldq $8, %xmm0 movd %xmm0, %eax @@ -1258,6 +1697,7 @@ _return_T_done_encrypt: pop %r13 pop %r12 ret + #endif -- cgit v1.2.3