diff options
author | Eric Biggers <ebiggers@google.com> | 2018-12-04 22:20:02 -0800 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2018-12-13 18:24:57 +0800 |
commit | 4af78261870a7d36dd222af8dad9688b705e365e (patch) | |
tree | 6013fcbd4c774b15033d09b67dfa7320ec22bc3a /arch/x86/crypto/chacha20-ssse3-x86_64.S | |
parent | 0f961f9f670e7c07690bfde2f533b93c653569cc (diff) | |
download | linux-4af78261870a7d36dd222af8dad9688b705e365e.tar.gz |
crypto: x86/chacha20 - add XChaCha20 support
Add an XChaCha20 implementation that is hooked up to the x86_64 SIMD implementations of ChaCha20. This can be used by Adiantum.

An SSSE3 implementation of single-block HChaCha20 is also added so that XChaCha20 can use it rather than the generic implementation. This required refactoring the ChaCha permutation into its own function.

Signed-off-by: Eric Biggers <ebiggers@google.com>

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/chacha20-ssse3-x86_64.S')
-rw-r--r-- | arch/x86/crypto/chacha20-ssse3-x86_64.S | 81 |
1 files changed, 56 insertions, 25 deletions
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha20-ssse3-x86_64.S index d8ac75bb448f..f6792789f875 100644 --- a/arch/x86/crypto/chacha20-ssse3-x86_64.S +++ b/arch/x86/crypto/chacha20-ssse3-x86_64.S @@ -10,6 +10,7 @@ */ #include <linux/linkage.h> +#include <asm/frame.h> .section .rodata.cst16.ROT8, "aM", @progbits, 16 .align 16 @@ -23,37 +24,24 @@ CTRINC: .octa 0x00000003000000020000000100000000 .text -ENTRY(chacha20_block_xor_ssse3) - # %rdi: Input state matrix, s - # %rsi: up to 1 data block output, o - # %rdx: up to 1 data block input, i - # %rcx: input/output length in bytes - - # This function encrypts one ChaCha20 block by loading the state matrix - # in four SSE registers. It performs matrix operation on four words in - # parallel, but requires shuffling to rearrange the words after each - # round. 8/16-bit word rotation is done with the slightly better - # performing SSSE3 byte shuffling, 7/12-bit word rotation uses - # traditional shift+OR. - - # x0..3 = s0..3 - movdqa 0x00(%rdi),%xmm0 - movdqa 0x10(%rdi),%xmm1 - movdqa 0x20(%rdi),%xmm2 - movdqa 0x30(%rdi),%xmm3 - movdqa %xmm0,%xmm8 - movdqa %xmm1,%xmm9 - movdqa %xmm2,%xmm10 - movdqa %xmm3,%xmm11 +/* + * chacha20_permute - permute one block + * + * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This + * function performs matrix operations on four words in parallel, but requires + * shuffling to rearrange the words after each round. 8/16-bit word rotation is + * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word + * rotation uses traditional shift+OR. 
+ * + * Clobbers: %ecx, %xmm4-%xmm7 + */ +chacha20_permute: movdqa ROT8(%rip),%xmm4 movdqa ROT16(%rip),%xmm5 - - mov %rcx,%rax mov $10,%ecx .Ldoubleround: - # x0 += x1, x3 = rotl32(x3 ^ x0, 16) paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 @@ -123,6 +111,29 @@ ENTRY(chacha20_block_xor_ssse3) dec %ecx jnz .Ldoubleround + ret +ENDPROC(chacha20_permute) + +ENTRY(chacha20_block_xor_ssse3) + # %rdi: Input state matrix, s + # %rsi: up to 1 data block output, o + # %rdx: up to 1 data block input, i + # %rcx: input/output length in bytes + FRAME_BEGIN + + # x0..3 = s0..3 + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + movdqa %xmm0,%xmm8 + movdqa %xmm1,%xmm9 + movdqa %xmm2,%xmm10 + movdqa %xmm3,%xmm11 + + mov %rcx,%rax + call chacha20_permute + # o0 = i0 ^ (x0 + s0) paddd %xmm8,%xmm0 cmp $0x10,%rax @@ -156,6 +167,7 @@ ENTRY(chacha20_block_xor_ssse3) movdqu %xmm0,0x30(%rsi) .Ldone: + FRAME_END ret .Lxorpart: @@ -189,6 +201,25 @@ ENTRY(chacha20_block_xor_ssse3) ENDPROC(chacha20_block_xor_ssse3) +ENTRY(hchacha20_block_ssse3) + # %rdi: Input state matrix, s + # %rsi: output (8 32-bit words) + FRAME_BEGIN + + movdqa 0x00(%rdi),%xmm0 + movdqa 0x10(%rdi),%xmm1 + movdqa 0x20(%rdi),%xmm2 + movdqa 0x30(%rdi),%xmm3 + + call chacha20_permute + + movdqu %xmm0,0x00(%rsi) + movdqu %xmm3,0x10(%rsi) + + FRAME_END + ret +ENDPROC(hchacha20_block_ssse3) + ENTRY(chacha20_4block_xor_ssse3) # %rdi: Input state matrix, s # %rsi: up to 4 data blocks output, o |