summary refs log tree commit diff
path: root/arch/arm/crypto
diff options
context:
space:
mode:
author	Eric Biggers <ebiggers@google.com>	2018-07-24 18:29:07 -0700
committer	Herbert Xu <herbert@gondor.apana.org.au>	2018-08-03 18:06:05 +0800
commit	4e34e51f48ab7f77a4022aa810a786daa3eb3e22 (patch)
tree	dde19d19ab296050fa3d7a2f362124bdbf7d04f5 /arch/arm/crypto
parent	f53ad3e1b35a558528f6d0041568016d1f623a9d (diff)
download	linux-4e34e51f48ab7f77a4022aa810a786daa3eb3e22.tar.gz
crypto: arm/chacha20 - always use vrev for 16-bit rotates
The 4-way ChaCha20 NEON code implements 16-bit rotates with vrev32.16,
but the one-way code (used on remainder blocks) implements it with
vshl + vsri, which is slower.  Switch the one-way code to vrev32.16 too.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/arm/crypto')
-rw-r--r--	arch/arm/crypto/chacha20-neon-core.S	| 10
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S
index 3fecb2124c35..451a849ad518 100644
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -51,9 +51,8 @@ ENTRY(chacha20_block_xor_neon)
 .Ldoubleround:
 	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 	vadd.i32	q0, q0, q1
-	veor		q4, q3, q0
-	vshl.u32	q3, q4, #16
-	vsri.u32	q3, q4, #16
+	veor		q3, q3, q0
+	vrev32.16	q3, q3
 
 	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
 	vadd.i32	q2, q2, q3
@@ -82,9 +81,8 @@ ENTRY(chacha20_block_xor_neon)
 
 	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
 	vadd.i32	q0, q0, q1
-	veor		q4, q3, q0
-	vshl.u32	q3, q4, #16
-	vsri.u32	q3, q4, #16
+	veor		q3, q3, q0
+	vrev32.16	q3, q3
 
 	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
 	vadd.i32	q2, q2, q3