diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-15 12:35:19 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-15 12:35:19 -0800 |
commit | 1ed55eac3b1fc30b29cdb52251e0f13b24fc344c (patch) | |
tree | b7a4c67f2e29f8aa418708c5da871e64c511f3ff /arch/x86/crypto/serpent-avx-x86_64-asm_64.S | |
parent | 08242bc2210938761230f79c5288dbcf72e94808 (diff) | |
parent | a2c0911c09190125f52c9941b9d187f601c2f7be (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
- Added aesni/avx/x86_64 implementations for camellia.
- Optimised AVX code for cast5/serpent/twofish/cast6.
- Fixed vmac bug with unaligned input.
- Allow compression algorithms in FIPS mode.
- Optimised crc32c implementation for Intel.
- Misc fixes.
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (32 commits)
crypto: caam - Updated SEC-4.0 device tree binding for ERA information.
crypto: testmgr - remove superfluous initializers for xts(aes)
crypto: testmgr - allow compression algs in fips mode
crypto: testmgr - add larger crc32c test vector to test FPU path in crc32c_intel
crypto: testmgr - clean alg_test_null entries in alg_test_descs[]
crypto: testmgr - remove fips_allowed flag from camellia-aesni null-tests
crypto: cast5/cast6 - move lookup tables to shared module
padata: use __this_cpu_read per-cpu helper
crypto: s5p-sss - Fix compilation error
crypto: picoxcell - Add terminating entry for platform_device_id table
crypto: omap-aes - select BLKCIPHER2
crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of camellia cipher
crypto: camellia-x86_64 - share common functions and move structures and function definitions to header file
crypto: tcrypt - add async speed test for camellia cipher
crypto: tegra-aes - fix error-valued pointer dereference
crypto: tegra - fix missing unlock on error case
crypto: cast5/avx - avoid using temporary stack buffers
crypto: serpent/avx - avoid using temporary stack buffers
crypto: twofish/avx - avoid using temporary stack buffers
crypto: cast6/avx - avoid using temporary stack buffers
...
Diffstat (limited to 'arch/x86/crypto/serpent-avx-x86_64-asm_64.S')
-rw-r--r-- | arch/x86/crypto/serpent-avx-x86_64-asm_64.S | 166 |
1 files changed, 108 insertions, 58 deletions
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 504106bf04a2..02b0e9fe997c 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S @@ -24,7 +24,16 @@ * */ +#include "glue_helper-asm-avx.S" + .file "serpent-avx-x86_64-asm_64.S" + +.data +.align 16 + +.Lbswap128_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + .text #define CTX %rdi @@ -550,51 +559,27 @@ vpunpcklqdq x3, t2, x2; \ vpunpckhqdq x3, t2, x3; -#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ - vmovdqu (0*4*4)(in), x0; \ - vmovdqu (1*4*4)(in), x1; \ - vmovdqu (2*4*4)(in), x2; \ - vmovdqu (3*4*4)(in), x3; \ - \ +#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ transpose_4x4(x0, x1, x2, x3, t0, t1, t2) -#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ - transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ - \ - vmovdqu x0, (0*4*4)(out); \ - vmovdqu x1, (1*4*4)(out); \ - vmovdqu x2, (2*4*4)(out); \ - vmovdqu x3, (3*4*4)(out); - -#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ - transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ - \ - vpxor (0*4*4)(out), x0, x0; \ - vmovdqu x0, (0*4*4)(out); \ - vpxor (1*4*4)(out), x1, x1; \ - vmovdqu x1, (1*4*4)(out); \ - vpxor (2*4*4)(out), x2, x2; \ - vmovdqu x2, (2*4*4)(out); \ - vpxor (3*4*4)(out), x3, x3; \ - vmovdqu x3, (3*4*4)(out); +#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ + transpose_4x4(x0, x1, x2, x3, t0, t1, t2) .align 8 -.global __serpent_enc_blk_8way_avx -.type __serpent_enc_blk_8way_avx,@function; +.type __serpent_enc_blk8_avx,@function; -__serpent_enc_blk_8way_avx: +__serpent_enc_blk8_avx: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src - * %rcx: bool, if true: xor output + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks + * output: + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks */ vpcmpeqd RNOT, RNOT, RNOT; - leaq (4*4*4)(%rdx), %rax; - read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); - read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); K2(RA, RB, RC, RD, RE, 0); S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); @@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx: S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); - leaq (4*4*4)(%rsi), %rax; - - testb %cl, %cl; - jnz __enc_xor8; - - write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); - write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); - - ret; - -__enc_xor8: - xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); - xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); ret; .align 8 -.global serpent_dec_blk_8way_avx -.type serpent_dec_blk_8way_avx,@function; +.type __serpent_dec_blk8_avx,@function; -serpent_dec_blk_8way_avx: +__serpent_dec_blk8_avx: /* input: * %rdi: ctx, CTX - * %rsi: dst - * %rdx: src + * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks + * output: + * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks */ vpcmpeqd RNOT, RNOT, RNOT; - leaq (4*4*4)(%rdx), %rax; - read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); - read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); + read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); + read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); K2(RA, RB, RC, RD, RE, 32); SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); @@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx: SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); - leaq (4*4*4)(%rsi), %rax; - write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); - write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); + write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); + write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); + + ret; + +.align 8 +.global serpent_ecb_enc_8way_avx +.type serpent_ecb_enc_8way_avx,@function; + +serpent_ecb_enc_8way_avx: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_enc_blk8_avx; + + store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + ret; + +.align 8 +.global serpent_ecb_dec_8way_avx +.type serpent_ecb_dec_8way_avx,@function; + +serpent_ecb_dec_8way_avx: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_dec_blk8_avx; + + store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + + ret; + +.align 8 +.global serpent_cbc_dec_8way_avx +.type serpent_cbc_dec_8way_avx,@function; + +serpent_cbc_dec_8way_avx: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + + load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); + + call __serpent_dec_blk8_avx; + + store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); + + ret; + +.align 8 +.global serpent_ctr_8way_avx +.type serpent_ctr_8way_avx,@function; + +serpent_ctr_8way_avx: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: iv (little endian, 128bit) + */ + + load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, + RD2, RK0, RK1, RK2); + + call __serpent_enc_blk8_avx; + + store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); ret; |