Diffstat (limited to 'arch/sparc')
-rw-r--r--  arch/sparc/Kbuild                    |    1
-rw-r--r--  arch/sparc/crypto/Makefile           |   25
-rw-r--r--  arch/sparc/crypto/aes_asm.S          | 1535
-rw-r--r--  arch/sparc/crypto/aes_glue.c         |  477
-rw-r--r--  arch/sparc/crypto/camellia_asm.S     |  563
-rw-r--r--  arch/sparc/crypto/camellia_glue.c    |  322
-rw-r--r--  arch/sparc/crypto/crc32c_asm.S       |   20
-rw-r--r--  arch/sparc/crypto/crc32c_glue.c      |  179
-rw-r--r--  arch/sparc/crypto/crop_devid.c       |   14
-rw-r--r--  arch/sparc/crypto/des_asm.S          |  418
-rw-r--r--  arch/sparc/crypto/des_glue.c         |  529
-rw-r--r--  arch/sparc/crypto/md5_asm.S          |   70
-rw-r--r--  arch/sparc/crypto/md5_glue.c         |  188
-rw-r--r--  arch/sparc/crypto/opcodes.h          |   99
-rw-r--r--  arch/sparc/crypto/sha1_asm.S         |   72
-rw-r--r--  arch/sparc/crypto/sha1_glue.c        |  183
-rw-r--r--  arch/sparc/crypto/sha256_asm.S       |   78
-rw-r--r--  arch/sparc/crypto/sha256_glue.c      |  241
-rw-r--r--  arch/sparc/crypto/sha512_asm.S       |  102
-rw-r--r--  arch/sparc/crypto/sha512_glue.c      |  226
-rw-r--r--  arch/sparc/include/asm/asi.h         |    4
-rw-r--r--  arch/sparc/include/asm/elf_64.h      |    9
-rw-r--r--  arch/sparc/include/asm/hypervisor.h  |   11
-rw-r--r--  arch/sparc/include/asm/mdesc.h       |    1
-rw-r--r--  arch/sparc/include/asm/pcr.h         |   36
-rw-r--r--  arch/sparc/include/asm/perfctr.h     |   30
-rw-r--r--  arch/sparc/include/asm/pstate.h      |   14
-rw-r--r--  arch/sparc/kernel/head_64.S          |   14
-rw-r--r--  arch/sparc/kernel/hvapi.c            |    1
-rw-r--r--  arch/sparc/kernel/hvcalls.S          |   16
-rw-r--r--  arch/sparc/kernel/ktlb.S             |   25
-rw-r--r--  arch/sparc/kernel/mdesc.c            |   24
-rw-r--r--  arch/sparc/kernel/nmi.c              |   21
-rw-r--r--  arch/sparc/kernel/pci_sun4v.c        |    2
-rw-r--r--  arch/sparc/kernel/pcr.c              |  172
-rw-r--r--  arch/sparc/kernel/perf_event.c       |  516
-rw-r--r--  arch/sparc/kernel/setup_64.c         |   67
-rw-r--r--  arch/sparc/lib/Makefile              |    3
-rw-r--r--  arch/sparc/lib/NG4copy_from_user.S   |   30
-rw-r--r--  arch/sparc/lib/NG4copy_page.S        |   57
-rw-r--r--  arch/sparc/lib/NG4copy_to_user.S     |   39
-rw-r--r--  arch/sparc/lib/NG4memcpy.S           |  360
-rw-r--r--  arch/sparc/lib/NG4patch.S            |   43
-rw-r--r--  arch/sparc/lib/NGpage.S              |    2
-rw-r--r--  arch/sparc/lib/ksyms.c               |    4
-rw-r--r--  arch/sparc/mm/init_64.c              |  230
-rw-r--r--  arch/sparc/mm/init_64.h              |    4
47 files changed, 6814 insertions, 263 deletions
diff --git a/arch/sparc/Kbuild b/arch/sparc/Kbuild
index 5cd01161fd00..675afa285ddb 100644
--- a/arch/sparc/Kbuild
+++ b/arch/sparc/Kbuild
@@ -6,3 +6,4 @@ obj-y += kernel/
 obj-y += mm/
 obj-y += math-emu/
 obj-y += net/
+obj-y += crypto/
diff --git a/arch/sparc/crypto/Makefile b/arch/sparc/crypto/Makefile
new file mode 100644
index 000000000000..6ae1ad5e502b
--- /dev/null
+++ b/arch/sparc/crypto/Makefile
@@ -0,0 +1,25 @@
+#
+# Arch-specific CryptoAPI modules.
+#
+
+obj-$(CONFIG_CRYPTO_SHA1_SPARC64) += sha1-sparc64.o
+obj-$(CONFIG_CRYPTO_SHA256_SPARC64) += sha256-sparc64.o
+obj-$(CONFIG_CRYPTO_SHA512_SPARC64) += sha512-sparc64.o
+obj-$(CONFIG_CRYPTO_MD5_SPARC64) += md5-sparc64.o
+
+obj-$(CONFIG_CRYPTO_AES_SPARC64) += aes-sparc64.o
+obj-$(CONFIG_CRYPTO_DES_SPARC64) += des-sparc64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_SPARC64) += camellia-sparc64.o
+
+obj-$(CONFIG_CRYPTO_CRC32C_SPARC64) += crc32c-sparc64.o
+
+sha1-sparc64-y := sha1_asm.o sha1_glue.o crop_devid.o
+sha256-sparc64-y := sha256_asm.o sha256_glue.o crop_devid.o
+sha512-sparc64-y := sha512_asm.o sha512_glue.o crop_devid.o
+md5-sparc64-y := md5_asm.o md5_glue.o crop_devid.o
+
+aes-sparc64-y := aes_asm.o aes_glue.o crop_devid.o
+des-sparc64-y := des_asm.o des_glue.o crop_devid.o
+camellia-sparc64-y := camellia_asm.o camellia_glue.o crop_devid.o
+
+crc32c-sparc64-y := crc32c_asm.o crc32c_glue.o crop_devid.o
diff --git a/arch/sparc/crypto/aes_asm.S b/arch/sparc/crypto/aes_asm.S
new file mode 100644
index 000000000000..23f6cbb910d3
--- /dev/null
+++ b/arch/sparc/crypto/aes_asm.S
@@ -0,0 +1,1535 @@
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
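+/* ENCRYPT_TWO_ROUNDS expands to two AES rounds built from the
+ * AES_EROUND01/AES_EROUND23 crypto opcodes (see opcodes.h).  All
+ * arguments are even floating point register numbers: the I pair
+ * holds the block state, the T pair is scratch.  The _2 variants
+ * interleave two independent blocks to cover opcode latency, and
+ * the _LAST variants use the *_L opcodes for the final round.
+ */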
+#define ENCRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \
+	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
+	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
+	AES_EROUND01(KEY_BASE +  4, T0, T1, I0) \
+	AES_EROUND23(KEY_BASE +  6, T0, T1, I1)
+
+#define ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
+	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
+	AES_EROUND01(KEY_BASE +  0, I2, I3, T2) \
+	AES_EROUND23(KEY_BASE +  2, I2, I3, T3) \
+	AES_EROUND01(KEY_BASE +  4, T0, T1, I0) \
+	AES_EROUND23(KEY_BASE +  6, T0, T1, I1) \
+	AES_EROUND01(KEY_BASE +  4, T2, T3, I2) \
+	AES_EROUND23(KEY_BASE +  6, T2, T3, I3)
+
+#define ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \
+	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
+	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
+	AES_EROUND01_L(KEY_BASE +  4, T0, T1, I0) \
+	AES_EROUND23_L(KEY_BASE +  6, T0, T1, I1)
+
+#define ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	AES_EROUND01(KEY_BASE +  0, I0, I1, T0) \
+	AES_EROUND23(KEY_BASE +  2, I0, I1, T1) \
+	AES_EROUND01(KEY_BASE +  0, I2, I3, T2) \
+	AES_EROUND23(KEY_BASE +  2, I2, I3, T3) \
+	AES_EROUND01_L(KEY_BASE +  4, T0, T1, I0) \
+	AES_EROUND23_L(KEY_BASE +  6, T0, T1, I1) \
+	AES_EROUND01_L(KEY_BASE +  4, T2, T3, I2) \
+	AES_EROUND23_L(KEY_BASE +  6, T2, T3, I3)
+
+	/* 10 rounds */
+#define ENCRYPT_128(KEY_BASE, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1)
+
+#define ENCRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3)
+
+	/* 12 rounds */
+#define ENCRYPT_192(KEY_BASE, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1)
+
+#define ENCRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \
+	ENCRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3)
+
+	/* 14 rounds */
+#define ENCRYPT_256(KEY_BASE, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \
+	ENCRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1)
+
+#define ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \
+	ENCRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \
+			     TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE + 6)
+
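+/* The AES-256 schedule plus two blocks of state exceed the floating
+ * point register file, so ENCRYPT_256_2 reuses key registers as
+ * temporaries and reloads the clobbered key doubles from %o0
+ * between rounds.
+ */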
+#define ENCRYPT_256_2(KEY_BASE, I0, I1, I2, I3) \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, KEY_BASE + 48) \
+	ldd	[%o0 + 0xd0], %f56; \
+	ldd	[%o0 + 0xd8], %f58; \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, KEY_BASE +  0) \
+	ldd	[%o0 + 0xe0], %f60; \
+	ldd	[%o0 + 0xe8], %f62; \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE +  0) \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE +  0) \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE +  0) \
+	ENCRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE +  0) \
+	AES_EROUND01(KEY_BASE +  48, I0, I1, KEY_BASE + 0) \
+	AES_EROUND23(KEY_BASE +  50, I0, I1, KEY_BASE + 2) \
+	AES_EROUND01(KEY_BASE +  48, I2, I3, KEY_BASE + 4) \
+	AES_EROUND23(KEY_BASE +  50, I2, I3, KEY_BASE + 6) \
+	AES_EROUND01_L(KEY_BASE +  52, KEY_BASE + 0, KEY_BASE + 2, I0) \
+	AES_EROUND23_L(KEY_BASE +  54, KEY_BASE + 0, KEY_BASE + 2, I1) \
+	ldd	[%o0 + 0x10], %f8; \
+	ldd	[%o0 + 0x18], %f10; \
+	AES_EROUND01_L(KEY_BASE +  52, KEY_BASE + 4, KEY_BASE + 6, I2) \
+	AES_EROUND23_L(KEY_BASE +  54, KEY_BASE + 4, KEY_BASE + 6, I3) \
+	ldd	[%o0 + 0x20], %f12; \
+	ldd	[%o0 + 0x28], %f14;
+
+#define DECRYPT_TWO_ROUNDS(KEY_BASE, I0, I1, T0, T1) \
+	AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \
+	AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \
+	AES_DROUND23(KEY_BASE +  4, T0, T1, I1) \
+	AES_DROUND01(KEY_BASE +  6, T0, T1, I0)
+
+#define DECRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \
+	AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \
+	AES_DROUND23(KEY_BASE +  0, I2, I3, T3) \
+	AES_DROUND01(KEY_BASE +  2, I2, I3, T2) \
+	AES_DROUND23(KEY_BASE +  4, T0, T1, I1) \
+	AES_DROUND01(KEY_BASE +  6, T0, T1, I0) \
+	AES_DROUND23(KEY_BASE +  4, T2, T3, I3) \
+	AES_DROUND01(KEY_BASE +  6, T2, T3, I2)
+
+#define DECRYPT_TWO_ROUNDS_LAST(KEY_BASE, I0, I1, T0, T1) \
+	AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \
+	AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \
+	AES_DROUND23_L(KEY_BASE +  4, T0, T1, I1) \
+	AES_DROUND01_L(KEY_BASE +  6, T0, T1, I0)
+
+#define DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	AES_DROUND23(KEY_BASE +  0, I0, I1, T1) \
+	AES_DROUND01(KEY_BASE +  2, I0, I1, T0) \
+	AES_DROUND23(KEY_BASE +  0, I2, I3, T3) \
+	AES_DROUND01(KEY_BASE +  2, I2, I3, T2) \
+	AES_DROUND23_L(KEY_BASE +  4, T0, T1, I1) \
+	AES_DROUND01_L(KEY_BASE +  6, T0, T1, I0) \
+	AES_DROUND23_L(KEY_BASE +  4, T2, T3, I3) \
+	AES_DROUND01_L(KEY_BASE +  6, T2, T3, I2)
+
+	/* 10 rounds */
+#define DECRYPT_128(KEY_BASE, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 32, I0, I1, T0, T1)
+
+#define DECRYPT_128_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3)
+
+	/* 12 rounds */
+#define DECRYPT_192(KEY_BASE, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 40, I0, I1, T0, T1)
+
+#define DECRYPT_192_2(KEY_BASE, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, T0, T1, T2, T3) \
+	DECRYPT_TWO_ROUNDS_LAST_2(KEY_BASE + 40, I0, I1, I2, I3, T0, T1, T2, T3)
+
+	/* 14 rounds */
+#define DECRYPT_256(KEY_BASE, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  0, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE +  8, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 16, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 24, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 32, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS(KEY_BASE + 40, I0, I1, T0, T1) \
+	DECRYPT_TWO_ROUNDS_LAST(KEY_BASE + 48, I0, I1, T0, T1)
+
+#define DECRYPT_256_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, TMP_BASE) \
+	DECRYPT_TWO_ROUNDS_2(KEY_BASE, I0, I1, I2, I3, \
+			     TMP_BASE + 0, TMP_BASE + 2, TMP_BASE + 4, TMP_BASE + 6)
+
+#define DECRYPT_256_2(KEY_BASE, I0, I1, I2, I3) \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE +  0, I0, I1, I2, I3, KEY_BASE + 48) \
+	ldd	[%o0 + 0x18], %f56; \
+	ldd	[%o0 + 0x10], %f58; \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE +  8, I0, I1, I2, I3, KEY_BASE +  0) \
+	ldd	[%o0 + 0x08], %f60; \
+	ldd	[%o0 + 0x00], %f62; \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 16, I0, I1, I2, I3, KEY_BASE +  0) \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 24, I0, I1, I2, I3, KEY_BASE +  0) \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 32, I0, I1, I2, I3, KEY_BASE +  0) \
+	DECRYPT_256_TWO_ROUNDS_2(KEY_BASE + 40, I0, I1, I2, I3, KEY_BASE +  0) \
+	AES_DROUND23(KEY_BASE +  48, I0, I1, KEY_BASE + 2) \
+	AES_DROUND01(KEY_BASE +  50, I0, I1, KEY_BASE + 0) \
+	AES_DROUND23(KEY_BASE +  48, I2, I3, KEY_BASE + 6) \
+	AES_DROUND01(KEY_BASE +  50, I2, I3, KEY_BASE + 4) \
+	AES_DROUND23_L(KEY_BASE +  52, KEY_BASE + 0, KEY_BASE + 2, I1) \
+	AES_DROUND01_L(KEY_BASE +  54, KEY_BASE + 0, KEY_BASE + 2, I0) \
+	ldd	[%o0 + 0xd8], %f8; \
+	ldd	[%o0 + 0xd0], %f10; \
+	AES_DROUND23_L(KEY_BASE +  52, KEY_BASE + 4, KEY_BASE + 6, I3) \
+	AES_DROUND01_L(KEY_BASE +  54, KEY_BASE + 4, KEY_BASE + 6, I2) \
+	ldd	[%o0 + 0xc8], %f12; \
+	ldd	[%o0 + 0xc0], %f14;
+
+	.align	32
+ENTRY(aes_sparc64_key_expand)
+	/* %o0=input_key, %o1=output_key, %o2=key_len */
+	VISEntry
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	ld	[%o0 + 0x0c], %f3
+
+	std	%f0, [%o1 + 0x00]
+	std	%f2, [%o1 + 0x08]
+	add	%o1, 0x10, %o1
+
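+	/* Dispatch on key length: less than 24 bytes is AES-128 (2f),
+	 * exactly 24 is AES-192 (1f), otherwise fall through to the
+	 * AES-256 expansion.
+	 */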
+	cmp	%o2, 24
+	bl	2f
+	 nop
+
+	be	1f
+	 nop
+
+	/* 256-bit key expansion */
+	ld	[%o0 + 0x10], %f4
+	ld	[%o0 + 0x14], %f5
+	ld	[%o0 + 0x18], %f6
+	ld	[%o0 + 0x1c], %f7
+
+	std	%f4, [%o1 + 0x00]
+	std	%f6, [%o1 + 0x08]
+	add	%o1, 0x10, %o1
+
+	AES_KEXPAND1(0, 6, 0x0, 8)
+	AES_KEXPAND2(2, 8, 10)
+	AES_KEXPAND0(4, 10, 12)
+	AES_KEXPAND2(6, 12, 14)
+	AES_KEXPAND1(8, 14, 0x1, 16)
+	AES_KEXPAND2(10, 16, 18)
+	AES_KEXPAND0(12, 18, 20)
+	AES_KEXPAND2(14, 20, 22)
+	AES_KEXPAND1(16, 22, 0x2, 24)
+	AES_KEXPAND2(18, 24, 26)
+	AES_KEXPAND0(20, 26, 28)
+	AES_KEXPAND2(22, 28, 30)
+	AES_KEXPAND1(24, 30, 0x3, 32)
+	AES_KEXPAND2(26, 32, 34)
+	AES_KEXPAND0(28, 34, 36)
+	AES_KEXPAND2(30, 36, 38)
+	AES_KEXPAND1(32, 38, 0x4, 40)
+	AES_KEXPAND2(34, 40, 42)
+	AES_KEXPAND0(36, 42, 44)
+	AES_KEXPAND2(38, 44, 46)
+	AES_KEXPAND1(40, 46, 0x5, 48)
+	AES_KEXPAND2(42, 48, 50)
+	AES_KEXPAND0(44, 50, 52)
+	AES_KEXPAND2(46, 52, 54)
+	AES_KEXPAND1(48, 54, 0x6, 56)
+	AES_KEXPAND2(50, 56, 58)
+
+	std	%f8, [%o1 + 0x00]
+	std	%f10, [%o1 + 0x08]
+	std	%f12, [%o1 + 0x10]
+	std	%f14, [%o1 + 0x18]
+	std	%f16, [%o1 + 0x20]
+	std	%f18, [%o1 + 0x28]
+	std	%f20, [%o1 + 0x30]
+	std	%f22, [%o1 + 0x38]
+	std	%f24, [%o1 + 0x40]
+	std	%f26, [%o1 + 0x48]
+	std	%f28, [%o1 + 0x50]
+	std	%f30, [%o1 + 0x58]
+	std	%f32, [%o1 + 0x60]
+	std	%f34, [%o1 + 0x68]
+	std	%f36, [%o1 + 0x70]
+	std	%f38, [%o1 + 0x78]
+	std	%f40, [%o1 + 0x80]
+	std	%f42, [%o1 + 0x88]
+	std	%f44, [%o1 + 0x90]
+	std	%f46, [%o1 + 0x98]
+	std	%f48, [%o1 + 0xa0]
+	std	%f50, [%o1 + 0xa8]
+	std	%f52, [%o1 + 0xb0]
+	std	%f54, [%o1 + 0xb8]
+	std	%f56, [%o1 + 0xc0]
+	ba,pt	%xcc, 80f
+	 std	%f58, [%o1 + 0xc8]
+
+1:
+	/* 192-bit key expansion */
+	ld	[%o0 + 0x10], %f4
+	ld	[%o0 + 0x14], %f5
+
+	std	%f4, [%o1 + 0x00]
+	add	%o1, 0x08, %o1
+
+	AES_KEXPAND1(0, 4, 0x0, 6)
+	AES_KEXPAND2(2, 6, 8)
+	AES_KEXPAND2(4, 8, 10)
+	AES_KEXPAND1(6, 10, 0x1, 12)
+	AES_KEXPAND2(8, 12, 14)
+	AES_KEXPAND2(10, 14, 16)
+	AES_KEXPAND1(12, 16, 0x2, 18)
+	AES_KEXPAND2(14, 18, 20)
+	AES_KEXPAND2(16, 20, 22)
+	AES_KEXPAND1(18, 22, 0x3, 24)
+	AES_KEXPAND2(20, 24, 26)
+	AES_KEXPAND2(22, 26, 28)
+	AES_KEXPAND1(24, 28, 0x4, 30)
+	AES_KEXPAND2(26, 30, 32)
+	AES_KEXPAND2(28, 32, 34)
+	AES_KEXPAND1(30, 34, 0x5, 36)
+	AES_KEXPAND2(32, 36, 38)
+	AES_KEXPAND2(34, 38, 40)
+	AES_KEXPAND1(36, 40, 0x6, 42)
+	AES_KEXPAND2(38, 42, 44)
+	AES_KEXPAND2(40, 44, 46)
+	AES_KEXPAND1(42, 46, 0x7, 48)
+	AES_KEXPAND2(44, 48, 50)
+
+	std	%f6, [%o1 + 0x00]
+	std	%f8, [%o1 + 0x08]
+	std	%f10, [%o1 + 0x10]
+	std	%f12, [%o1 + 0x18]
+	std	%f14, [%o1 + 0x20]
+	std	%f16, [%o1 + 0x28]
+	std	%f18, [%o1 + 0x30]
+	std	%f20, [%o1 + 0x38]
+	std	%f22, [%o1 + 0x40]
+	std	%f24, [%o1 + 0x48]
+	std	%f26, [%o1 + 0x50]
+	std	%f28, [%o1 + 0x58]
+	std	%f30, [%o1 + 0x60]
+	std	%f32, [%o1 + 0x68]
+	std	%f34, [%o1 + 0x70]
+	std	%f36, [%o1 + 0x78]
+	std	%f38, [%o1 + 0x80]
+	std	%f40, [%o1 + 0x88]
+	std	%f42, [%o1 + 0x90]
+	std	%f44, [%o1 + 0x98]
+	std	%f46, [%o1 + 0xa0]
+	std	%f48, [%o1 + 0xa8]
+	ba,pt	%xcc, 80f
+	 std	%f50, [%o1 + 0xb0]
+
+2:
+	/* 128-bit key expansion */
+	AES_KEXPAND1(0, 2, 0x0, 4)
+	AES_KEXPAND2(2, 4, 6)
+	AES_KEXPAND1(4, 6, 0x1, 8)
+	AES_KEXPAND2(6, 8, 10)
+	AES_KEXPAND1(8, 10, 0x2, 12)
+	AES_KEXPAND2(10, 12, 14)
+	AES_KEXPAND1(12, 14, 0x3, 16)
+	AES_KEXPAND2(14, 16, 18)
+	AES_KEXPAND1(16, 18, 0x4, 20)
+	AES_KEXPAND2(18, 20, 22)
+	AES_KEXPAND1(20, 22, 0x5, 24)
+	AES_KEXPAND2(22, 24, 26)
+	AES_KEXPAND1(24, 26, 0x6, 28)
+	AES_KEXPAND2(26, 28, 30)
+	AES_KEXPAND1(28, 30, 0x7, 32)
+	AES_KEXPAND2(30, 32, 34)
+	AES_KEXPAND1(32, 34, 0x8, 36)
+	AES_KEXPAND2(34, 36, 38)
+	AES_KEXPAND1(36, 38, 0x9, 40)
+	AES_KEXPAND2(38, 40, 42)
+
+	std	%f4, [%o1 + 0x00]
+	std	%f6, [%o1 + 0x08]
+	std	%f8, [%o1 + 0x10]
+	std	%f10, [%o1 + 0x18]
+	std	%f12, [%o1 + 0x20]
+	std	%f14, [%o1 + 0x28]
+	std	%f16, [%o1 + 0x30]
+	std	%f18, [%o1 + 0x38]
+	std	%f20, [%o1 + 0x40]
+	std	%f22, [%o1 + 0x48]
+	std	%f24, [%o1 + 0x50]
+	std	%f26, [%o1 + 0x58]
+	std	%f28, [%o1 + 0x60]
+	std	%f30, [%o1 + 0x68]
+	std	%f32, [%o1 + 0x70]
+	std	%f34, [%o1 + 0x78]
+	std	%f36, [%o1 + 0x80]
+	std	%f38, [%o1 + 0x88]
+	std	%f40, [%o1 + 0x90]
+	std	%f42, [%o1 + 0x98]
+80:
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_key_expand)
+
+	.align		32
+ENTRY(aes_sparc64_encrypt_128)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+	ldd		[%o0 + 0x00], %f8
+	ldd		[%o0 + 0x08], %f10
+	ldd		[%o0 + 0x10], %f12
+	ldd		[%o0 + 0x18], %f14
+	ldd		[%o0 + 0x20], %f16
+	ldd		[%o0 + 0x28], %f18
+	ldd		[%o0 + 0x30], %f20
+	ldd		[%o0 + 0x38], %f22
+	ldd		[%o0 + 0x40], %f24
+	ldd		[%o0 + 0x48], %f26
+	ldd		[%o0 + 0x50], %f28
+	ldd		[%o0 + 0x58], %f30
+	ldd		[%o0 + 0x60], %f32
+	ldd		[%o0 + 0x68], %f34
+	ldd		[%o0 + 0x70], %f36
+	ldd		[%o0 + 0x78], %f38
+	ldd		[%o0 + 0x80], %f40
+	ldd		[%o0 + 0x88], %f42
+	ldd		[%o0 + 0x90], %f44
+	ldd		[%o0 + 0x98], %f46
+	ldd		[%o0 + 0xa0], %f48
+	ldd		[%o0 + 0xa8], %f50
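+	/* %f8/%f10 hold round key 0: XOR it in as the initial
+	 * whitening, then run ten rounds on the keys in %f12-%f50.
+	 */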
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+	ENCRYPT_128(12, 4, 6, 0, 2)
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_encrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_encrypt_192)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+
+	ldd		[%o0 + 0x00], %f8
+	ldd		[%o0 + 0x08], %f10
+
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	add		%o0, 0x20, %o0
+
+	ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+	ldd		[%o0 + 0x10], %f12
+	ldd		[%o0 + 0x18], %f14
+	ldd		[%o0 + 0x20], %f16
+	ldd		[%o0 + 0x28], %f18
+	ldd		[%o0 + 0x30], %f20
+	ldd		[%o0 + 0x38], %f22
+	ldd		[%o0 + 0x40], %f24
+	ldd		[%o0 + 0x48], %f26
+	ldd		[%o0 + 0x50], %f28
+	ldd		[%o0 + 0x58], %f30
+	ldd		[%o0 + 0x60], %f32
+	ldd		[%o0 + 0x68], %f34
+	ldd		[%o0 + 0x70], %f36
+	ldd		[%o0 + 0x78], %f38
+	ldd		[%o0 + 0x80], %f40
+	ldd		[%o0 + 0x88], %f42
+	ldd		[%o0 + 0x90], %f44
+	ldd		[%o0 + 0x98], %f46
+	ldd		[%o0 + 0xa0], %f48
+	ldd		[%o0 + 0xa8], %f50
+
+	ENCRYPT_128(12, 4, 6, 0, 2)
+
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_encrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_encrypt_256)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+
+	ldd		[%o0 + 0x00], %f8
+	ldd		[%o0 + 0x08], %f10
+
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	add		%o0, 0x20, %o0
+
+	ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	add		%o0, 0x20, %o0
+
+	ENCRYPT_TWO_ROUNDS(8, 4, 6, 0, 2)
+
+	ldd		[%o0 + 0x10], %f12
+	ldd		[%o0 + 0x18], %f14
+	ldd		[%o0 + 0x20], %f16
+	ldd		[%o0 + 0x28], %f18
+	ldd		[%o0 + 0x30], %f20
+	ldd		[%o0 + 0x38], %f22
+	ldd		[%o0 + 0x40], %f24
+	ldd		[%o0 + 0x48], %f26
+	ldd		[%o0 + 0x50], %f28
+	ldd		[%o0 + 0x58], %f30
+	ldd		[%o0 + 0x60], %f32
+	ldd		[%o0 + 0x68], %f34
+	ldd		[%o0 + 0x70], %f36
+	ldd		[%o0 + 0x78], %f38
+	ldd		[%o0 + 0x80], %f40
+	ldd		[%o0 + 0x88], %f42
+	ldd		[%o0 + 0x90], %f44
+	ldd		[%o0 + 0x98], %f46
+	ldd		[%o0 + 0xa0], %f48
+	ldd		[%o0 + 0xa8], %f50
+
+	ENCRYPT_128(12, 4, 6, 0, 2)
+
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_encrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_decrypt_128)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+	ldd		[%o0 + 0xa0], %f8
+	ldd		[%o0 + 0xa8], %f10
+	ldd		[%o0 + 0x98], %f12
+	ldd		[%o0 + 0x90], %f14
+	ldd		[%o0 + 0x88], %f16
+	ldd		[%o0 + 0x80], %f18
+	ldd		[%o0 + 0x78], %f20
+	ldd		[%o0 + 0x70], %f22
+	ldd		[%o0 + 0x68], %f24
+	ldd		[%o0 + 0x60], %f26
+	ldd		[%o0 + 0x58], %f28
+	ldd		[%o0 + 0x50], %f30
+	ldd		[%o0 + 0x48], %f32
+	ldd		[%o0 + 0x40], %f34
+	ldd		[%o0 + 0x38], %f36
+	ldd		[%o0 + 0x30], %f38
+	ldd		[%o0 + 0x28], %f40
+	ldd		[%o0 + 0x20], %f42
+	ldd		[%o0 + 0x18], %f44
+	ldd		[%o0 + 0x10], %f46
+	ldd		[%o0 + 0x08], %f48
+	ldd		[%o0 + 0x00], %f50
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+	DECRYPT_128(12, 4, 6, 0, 2)
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_decrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_decrypt_192)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+	ldd		[%o0 + 0xc0], %f8
+	ldd		[%o0 + 0xc8], %f10
+	ldd		[%o0 + 0xb8], %f12
+	ldd		[%o0 + 0xb0], %f14
+	ldd		[%o0 + 0xa8], %f16
+	ldd		[%o0 + 0xa0], %f18
+	fxor		%f8, %f4, %f4
+	fxor		%f10, %f6, %f6
+	ldd		[%o0 + 0x98], %f20
+	ldd		[%o0 + 0x90], %f22
+	ldd		[%o0 + 0x88], %f24
+	ldd		[%o0 + 0x80], %f26
+	DECRYPT_TWO_ROUNDS(12, 4, 6, 0, 2)
+	ldd		[%o0 + 0x78], %f28
+	ldd		[%o0 + 0x70], %f30
+	ldd		[%o0 + 0x68], %f32
+	ldd		[%o0 + 0x60], %f34
+	ldd		[%o0 + 0x58], %f36
+	ldd		[%o0 + 0x50], %f38
+	ldd		[%o0 + 0x48], %f40
+	ldd		[%o0 + 0x40], %f42
+	ldd		[%o0 + 0x38], %f44
+	ldd		[%o0 + 0x30], %f46
+	ldd		[%o0 + 0x28], %f48
+	ldd		[%o0 + 0x20], %f50
+	ldd		[%o0 + 0x18], %f52
+	ldd		[%o0 + 0x10], %f54
+	ldd		[%o0 + 0x08], %f56
+	ldd		[%o0 + 0x00], %f58
+	DECRYPT_128(20, 4, 6, 0, 2)
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_decrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_decrypt_256)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ld		[%o1 + 0x00], %f4
+	ld		[%o1 + 0x04], %f5
+	ld		[%o1 + 0x08], %f6
+	ld		[%o1 + 0x0c], %f7
+	ldd		[%o0 + 0xe0], %f8
+	ldd		[%o0 + 0xe8], %f10
+	ldd		[%o0 + 0xd8], %f12
+	ldd		[%o0 + 0xd0], %f14
+	ldd		[%o0 + 0xc8], %f16
+	fxor		%f8, %f4, %f4
+	ldd		[%o0 + 0xc0], %f18
+	fxor		%f10, %f6, %f6
+	ldd		[%o0 + 0xb8], %f20
+	AES_DROUND23(12, 4, 6, 2)
+	ldd		[%o0 + 0xb0], %f22
+	AES_DROUND01(14, 4, 6, 0)
+	ldd		[%o0 + 0xa8], %f24
+	AES_DROUND23(16, 0, 2, 6)
+	ldd		[%o0 + 0xa0], %f26
+	AES_DROUND01(18, 0, 2, 4)
+	ldd		[%o0 + 0x98], %f12
+	AES_DROUND23(20, 4, 6, 2)
+	ldd		[%o0 + 0x90], %f14
+	AES_DROUND01(22, 4, 6, 0)
+	ldd		[%o0 + 0x88], %f16
+	AES_DROUND23(24, 0, 2, 6)
+	ldd		[%o0 + 0x80], %f18
+	AES_DROUND01(26, 0, 2, 4)
+	ldd		[%o0 + 0x78], %f20
+	AES_DROUND23(12, 4, 6, 2)
+	ldd		[%o0 + 0x70], %f22
+	AES_DROUND01(14, 4, 6, 0)
+	ldd		[%o0 + 0x68], %f24
+	AES_DROUND23(16, 0, 2, 6)
+	ldd		[%o0 + 0x60], %f26
+	AES_DROUND01(18, 0, 2, 4)
+	ldd		[%o0 + 0x58], %f28
+	AES_DROUND23(20, 4, 6, 2)
+	ldd		[%o0 + 0x50], %f30
+	AES_DROUND01(22, 4, 6, 0)
+	ldd		[%o0 + 0x48], %f32
+	AES_DROUND23(24, 0, 2, 6)
+	ldd		[%o0 + 0x40], %f34
+	AES_DROUND01(26, 0, 2, 4)
+	ldd		[%o0 + 0x38], %f36
+	AES_DROUND23(28, 4, 6, 2)
+	ldd		[%o0 + 0x30], %f38
+	AES_DROUND01(30, 4, 6, 0)
+	ldd		[%o0 + 0x28], %f40
+	AES_DROUND23(32, 0, 2, 6)
+	ldd		[%o0 + 0x20], %f42
+	AES_DROUND01(34, 0, 2, 4)
+	ldd		[%o0 + 0x18], %f44
+	AES_DROUND23(36, 4, 6, 2)
+	ldd		[%o0 + 0x10], %f46
+	AES_DROUND01(38, 4, 6, 0)
+	ldd		[%o0 + 0x08], %f48
+	AES_DROUND23(40, 0, 2, 6)
+	ldd		[%o0 + 0x00], %f50
+	AES_DROUND01(42, 0, 2, 4)
+	AES_DROUND23(44, 4, 6, 2)
+	AES_DROUND01(46, 4, 6, 0)
+	AES_DROUND23_L(48, 0, 2, 6)
+	AES_DROUND01_L(50, 0, 2, 4)
+	st		%f4, [%o2 + 0x00]
+	st		%f5, [%o2 + 0x04]
+	st		%f6, [%o2 + 0x08]
+	st		%f7, [%o2 + 0x0c]
+	retl
+	 VISExit
+ENDPROC(aes_sparc64_decrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_load_encrypt_keys_128)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	ldd		[%o0 + 0x30], %f16
+	ldd		[%o0 + 0x38], %f18
+	ldd		[%o0 + 0x40], %f20
+	ldd		[%o0 + 0x48], %f22
+	ldd		[%o0 + 0x50], %f24
+	ldd		[%o0 + 0x58], %f26
+	ldd		[%o0 + 0x60], %f28
+	ldd		[%o0 + 0x68], %f30
+	ldd		[%o0 + 0x70], %f32
+	ldd		[%o0 + 0x78], %f34
+	ldd		[%o0 + 0x80], %f36
+	ldd		[%o0 + 0x88], %f38
+	ldd		[%o0 + 0x90], %f40
+	ldd		[%o0 + 0x98], %f42
+	ldd		[%o0 + 0xa0], %f44
+	retl
+	 ldd		[%o0 + 0xa8], %f46
+ENDPROC(aes_sparc64_load_encrypt_keys_128)
+
+	.align		32
+ENTRY(aes_sparc64_load_encrypt_keys_192)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	ldd		[%o0 + 0x30], %f16
+	ldd		[%o0 + 0x38], %f18
+	ldd		[%o0 + 0x40], %f20
+	ldd		[%o0 + 0x48], %f22
+	ldd		[%o0 + 0x50], %f24
+	ldd		[%o0 + 0x58], %f26
+	ldd		[%o0 + 0x60], %f28
+	ldd		[%o0 + 0x68], %f30
+	ldd		[%o0 + 0x70], %f32
+	ldd		[%o0 + 0x78], %f34
+	ldd		[%o0 + 0x80], %f36
+	ldd		[%o0 + 0x88], %f38
+	ldd		[%o0 + 0x90], %f40
+	ldd		[%o0 + 0x98], %f42
+	ldd		[%o0 + 0xa0], %f44
+	ldd		[%o0 + 0xa8], %f46
+	ldd		[%o0 + 0xb0], %f48
+	ldd		[%o0 + 0xb8], %f50
+	ldd		[%o0 + 0xc0], %f52
+	retl
+	 ldd		[%o0 + 0xc8], %f54
+ENDPROC(aes_sparc64_load_encrypt_keys_192)
+
+	.align		32
+ENTRY(aes_sparc64_load_encrypt_keys_256)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0x10], %f8
+	ldd		[%o0 + 0x18], %f10
+	ldd		[%o0 + 0x20], %f12
+	ldd		[%o0 + 0x28], %f14
+	ldd		[%o0 + 0x30], %f16
+	ldd		[%o0 + 0x38], %f18
+	ldd		[%o0 + 0x40], %f20
+	ldd		[%o0 + 0x48], %f22
+	ldd		[%o0 + 0x50], %f24
+	ldd		[%o0 + 0x58], %f26
+	ldd		[%o0 + 0x60], %f28
+	ldd		[%o0 + 0x68], %f30
+	ldd		[%o0 + 0x70], %f32
+	ldd		[%o0 + 0x78], %f34
+	ldd		[%o0 + 0x80], %f36
+	ldd		[%o0 + 0x88], %f38
+	ldd		[%o0 + 0x90], %f40
+	ldd		[%o0 + 0x98], %f42
+	ldd		[%o0 + 0xa0], %f44
+	ldd		[%o0 + 0xa8], %f46
+	ldd		[%o0 + 0xb0], %f48
+	ldd		[%o0 + 0xb8], %f50
+	ldd		[%o0 + 0xc0], %f52
+	ldd		[%o0 + 0xc8], %f54
+	ldd		[%o0 + 0xd0], %f56
+	ldd		[%o0 + 0xd8], %f58
+	ldd		[%o0 + 0xe0], %f60
+	retl
+	 ldd		[%o0 + 0xe8], %f62
+ENDPROC(aes_sparc64_load_encrypt_keys_256)
+
+	.align		32
+ENTRY(aes_sparc64_load_decrypt_keys_128)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0x98], %f8
+	ldd		[%o0 + 0x90], %f10
+	ldd		[%o0 + 0x88], %f12
+	ldd		[%o0 + 0x80], %f14
+	ldd		[%o0 + 0x78], %f16
+	ldd		[%o0 + 0x70], %f18
+	ldd		[%o0 + 0x68], %f20
+	ldd		[%o0 + 0x60], %f22
+	ldd		[%o0 + 0x58], %f24
+	ldd		[%o0 + 0x50], %f26
+	ldd		[%o0 + 0x48], %f28
+	ldd		[%o0 + 0x40], %f30
+	ldd		[%o0 + 0x38], %f32
+	ldd		[%o0 + 0x30], %f34
+	ldd		[%o0 + 0x28], %f36
+	ldd		[%o0 + 0x20], %f38
+	ldd		[%o0 + 0x18], %f40
+	ldd		[%o0 + 0x10], %f42
+	ldd		[%o0 + 0x08], %f44
+	retl
+	 ldd		[%o0 + 0x00], %f46
+ENDPROC(aes_sparc64_load_decrypt_keys_128)
+
+	.align		32
+ENTRY(aes_sparc64_load_decrypt_keys_192)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0xb8], %f8
+	ldd		[%o0 + 0xb0], %f10
+	ldd		[%o0 + 0xa8], %f12
+	ldd		[%o0 + 0xa0], %f14
+	ldd		[%o0 + 0x98], %f16
+	ldd		[%o0 + 0x90], %f18
+	ldd		[%o0 + 0x88], %f20
+	ldd		[%o0 + 0x80], %f22
+	ldd		[%o0 + 0x78], %f24
+	ldd		[%o0 + 0x70], %f26
+	ldd		[%o0 + 0x68], %f28
+	ldd		[%o0 + 0x60], %f30
+	ldd		[%o0 + 0x58], %f32
+	ldd		[%o0 + 0x50], %f34
+	ldd		[%o0 + 0x48], %f36
+	ldd		[%o0 + 0x40], %f38
+	ldd		[%o0 + 0x38], %f40
+	ldd		[%o0 + 0x30], %f42
+	ldd		[%o0 + 0x28], %f44
+	ldd		[%o0 + 0x20], %f46
+	ldd		[%o0 + 0x18], %f48
+	ldd		[%o0 + 0x10], %f50
+	ldd		[%o0 + 0x08], %f52
+	retl
+	 ldd		[%o0 + 0x00], %f54
+ENDPROC(aes_sparc64_load_decrypt_keys_192)
+
+	.align		32
+ENTRY(aes_sparc64_load_decrypt_keys_256)
+	/* %o0=key */
+	VISEntry
+	ldd		[%o0 + 0xd8], %f8
+	ldd		[%o0 + 0xd0], %f10
+	ldd		[%o0 + 0xc8], %f12
+	ldd		[%o0 + 0xc0], %f14
+	ldd		[%o0 + 0xb8], %f16
+	ldd		[%o0 + 0xb0], %f18
+	ldd		[%o0 + 0xa8], %f20
+	ldd		[%o0 + 0xa0], %f22
+	ldd		[%o0 + 0x98], %f24
+	ldd		[%o0 + 0x90], %f26
+	ldd		[%o0 + 0x88], %f28
+	ldd		[%o0 + 0x80], %f30
+	ldd		[%o0 + 0x78], %f32
+	ldd		[%o0 + 0x70], %f34
+	ldd		[%o0 + 0x68], %f36
+	ldd		[%o0 + 0x60], %f38
+	ldd		[%o0 + 0x58], %f40
+	ldd		[%o0 + 0x50], %f42
+	ldd		[%o0 + 0x48], %f44
+	ldd		[%o0 + 0x40], %f46
+	ldd		[%o0 + 0x38], %f48
+	ldd		[%o0 + 0x30], %f50
+	ldd		[%o0 + 0x28], %f52
+	ldd		[%o0 + 0x20], %f54
+	ldd		[%o0 + 0x18], %f56
+	ldd		[%o0 + 0x10], %f58
+	ldd		[%o0 + 0x08], %f60
+	retl
+	 ldd		[%o0 + 0x00], %f62
+ENDPROC(aes_sparc64_load_decrypt_keys_256)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_encrypt_128)
+	/* %o0=key, %o1=input, %o2=output, %o3=len */
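+	/* The loop at 1: handles two blocks (0x20 bytes) per pass; a
+	 * single trailing block is finished at 10:.  The round keys
+	 * were preloaded by aes_sparc64_load_encrypt_keys_128().
+	 */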
+	ldx		[%o0 + 0x00], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
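+	/* MOVXTOD_* (opcodes.h) moves a 64-bit integer register into
+	 * a double precision FP register via the movxtod opcode.
+	 */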
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F60
+	MOVXTOD_G7_F62
+	ENCRYPT_128_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	ENCRYPT_128(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_encrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_encrypt_192)
+	/* %o0=key, %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 + 0x00], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F60
+	MOVXTOD_G7_F62
+	ENCRYPT_192_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	ENCRYPT_192(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_encrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_encrypt_256)
+	/* %o0=key, %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 + 0x00], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	ENCRYPT_256_2(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f0, [%o2 + 0x10]
+	std		%f2, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	ENCRYPT_256(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_encrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_decrypt_128)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 - 0x10], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 - 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F60
+	MOVXTOD_G7_F62
+	DECRYPT_128_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz,pt		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_128(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_decrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_decrypt_192)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 - 0x10], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 - 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F60
+	MOVXTOD_G7_F62
+	DECRYPT_192_2(8, 4, 6, 60, 62, 0, 2, 56, 58)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz,pt		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_192(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_decrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_ecb_decrypt_256)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len */
+	ldx		[%o0 - 0x10], %g1
+	subcc		%o3, 0x10, %o3
+	be		10f
+	 ldx		[%o0 - 0x08], %g2
+	sub		%o0, 0xf0, %o0
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	xor		%g1, %o4, %g3
+	xor		%g2, %o5, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	DECRYPT_256_2(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	std		%f0, [%o2 + 0x10]
+	std		%f2, [%o2 + 0x18]
+	sub		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz,pt		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_256(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	retl
+	 nop
+ENDPROC(aes_sparc64_ecb_decrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_encrypt_128)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
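+	/* CBC encryption is inherently serial: each block is XORed
+	 * with the previous ciphertext kept in %f4/%f6, so only one
+	 * block can be processed per iteration.
+	 */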
+	ldd		[%o4 + 0x00], %f4
+	ldd		[%o4 + 0x08], %f6
+	ldx		[%o0 + 0x00], %g1
+	ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	ENCRYPT_128(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	std		%f4, [%o4 + 0x00]
+	std		%f6, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_encrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_encrypt_192)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd		[%o4 + 0x00], %f4
+	ldd		[%o4 + 0x08], %f6
+	ldx		[%o0 + 0x00], %g1
+	ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	ENCRYPT_192(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	std		%f4, [%o4 + 0x00]
+	std		%f6, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_encrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_encrypt_256)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd		[%o4 + 0x00], %f4
+	ldd		[%o4 + 0x08], %f6
+	ldx		[%o0 + 0x00], %g1
+	ldx		[%o0 + 0x08], %g2
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F0
+	MOVXTOD_G7_F2
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	ENCRYPT_256(8, 4, 6, 0, 2)
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	std		%f4, [%o4 + 0x00]
+	std		%f6, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_encrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_decrypt_128)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */
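+	/* %g1/%g2 cache the key material XORed into each ciphertext
+	 * block before the rounds; %o0 is then free and is reused
+	 * together with %o5 to carry the chaining value.
+	 */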
+	ldx		[%o0 - 0x10], %g1
+	ldx		[%o0 - 0x08], %g2
+	ldx		[%o4 + 0x00], %o0
+	ldx		[%o4 + 0x08], %o5
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_128(8, 4, 6, 0, 2)
+	MOVXTOD_O0_F0
+	MOVXTOD_O5_F2
+	xor		%g1, %g3, %o0
+	xor		%g2, %g7, %o5
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	stx		%o0, [%o4 + 0x00]
+	stx		%o5, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_decrypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_decrypt_192)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */
+	ldx		[%o0 - 0x10], %g1
+	ldx		[%o0 - 0x08], %g2
+	ldx		[%o4 + 0x00], %o0
+	ldx		[%o4 + 0x08], %o5
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_192(8, 4, 6, 0, 2)
+	MOVXTOD_O0_F0
+	MOVXTOD_O5_F2
+	xor		%g1, %g3, %o0
+	xor		%g2, %g7, %o5
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	stx		%o0, [%o4 + 0x00]
+	stx		%o5, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_decrypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_cbc_decrypt_256)
+	/* %o0=&key[key_len], %o1=input, %o2=output, %o3=len, %o4=iv */
+	ldx		[%o0 - 0x10], %g1
+	ldx		[%o0 - 0x08], %g2
+	ldx		[%o4 + 0x00], %o0
+	ldx		[%o4 + 0x08], %o5
+1:	ldx		[%o1 + 0x00], %g3
+	ldx		[%o1 + 0x08], %g7
+	add		%o1, 0x10, %o1
+	xor		%g1, %g3, %g3
+	xor		%g2, %g7, %g7
+	MOVXTOD_G3_F4
+	MOVXTOD_G7_F6
+	DECRYPT_256(8, 4, 6, 0, 2)
+	MOVXTOD_O0_F0
+	MOVXTOD_O5_F2
+	xor		%g1, %g3, %o0
+	xor		%g2, %g7, %o5
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+	subcc		%o3, 0x10, %o3
+	bne,pt		%xcc, 1b
+	 add		%o2, 0x10, %o2
+	stx		%o0, [%o4 + 0x00]
+	stx		%o5, [%o4 + 0x08]
+	retl
+	 nop
+ENDPROC(aes_sparc64_cbc_decrypt_256)
+
+	.align		32
+ENTRY(aes_sparc64_ctr_crypt_128)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
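+	/* The 128-bit counter lives in %g3 (high)/%g7 (low).  After
+	 * each increment, movrz commits the pre-incremented high word
+	 * in %o5 only when %g7 has wrapped to zero.
+	 */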
+	ldx		[%o4 + 0x00], %g3
+	ldx		[%o4 + 0x08], %g7
+	subcc		%o3, 0x10, %o3
+	ldx		[%o0 + 0x00], %g1
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F4
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F6
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_128_2(8, 0, 2, 4, 6, 56, 58, 60, 62)
+	ldd		[%o1 + 0x00], %f56
+	ldd		[%o1 + 0x08], %f58
+	ldd		[%o1 + 0x10], %f60
+	ldd		[%o1 + 0x18], %f62
+	fxor		%f56, %f0, %f56
+	fxor		%f58, %f2, %f58
+	fxor		%f60, %f4, %f60
+	fxor		%f62, %f6, %f62
+	std		%f56, [%o2 + 0x00]
+	std		%f58, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	subcc		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_128(8, 0, 2, 4, 6)
+	ldd		[%o1 + 0x00], %f4
+	ldd		[%o1 + 0x08], %f6
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	stx		%g3, [%o4 + 0x00]
+	retl
+	 stx		%g7, [%o4 + 0x08]
+ENDPROC(aes_sparc64_ctr_crypt_128)
+
+	.align		32
+ENTRY(aes_sparc64_ctr_crypt_192)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldx		[%o4 + 0x00], %g3
+	ldx		[%o4 + 0x08], %g7
+	subcc		%o3, 0x10, %o3
+	ldx		[%o0 + 0x00], %g1
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F4
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F6
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_192_2(8, 0, 2, 4, 6, 56, 58, 60, 62)
+	ldd		[%o1 + 0x00], %f56
+	ldd		[%o1 + 0x08], %f58
+	ldd		[%o1 + 0x10], %f60
+	ldd		[%o1 + 0x18], %f62
+	fxor		%f56, %f0, %f56
+	fxor		%f58, %f2, %f58
+	fxor		%f60, %f4, %f60
+	fxor		%f62, %f6, %f62
+	std		%f56, [%o2 + 0x00]
+	std		%f58, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	subcc		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
+10:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_192(8, 0, 2, 4, 6)
+	ldd		[%o1 + 0x00], %f4
+	ldd		[%o1 + 0x08], %f6
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	stx		%g3, [%o4 + 0x00]
+	retl
+	 stx		%g7, [%o4 + 0x08]
+ENDPROC(aes_sparc64_ctr_crypt_192)
+
+	.align		32
+ENTRY(aes_sparc64_ctr_crypt_256)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldx		[%o4 + 0x00], %g3
+	ldx		[%o4 + 0x08], %g7
+	subcc		%o3, 0x10, %o3
+	ldx		[%o0 + 0x00], %g1
+	be		10f
+	 ldx		[%o0 + 0x08], %g2
+1:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F4
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F6
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_256_2(8, 0, 2, 4, 6)
+	ldd		[%o1 + 0x00], %f56
+	ldd		[%o1 + 0x08], %f58
+	ldd		[%o1 + 0x10], %f60
+	ldd		[%o1 + 0x18], %f62
+	fxor		%f56, %f0, %f56
+	fxor		%f58, %f2, %f58
+	fxor		%f60, %f4, %f60
+	fxor		%f62, %f6, %f62
+	std		%f56, [%o2 + 0x00]
+	std		%f58, [%o2 + 0x08]
+	std		%f60, [%o2 + 0x10]
+	std		%f62, [%o2 + 0x18]
+	subcc		%o3, 0x20, %o3
+	add		%o1, 0x20, %o1
+	brgz		%o3, 1b
+	 add		%o2, 0x20, %o2
+	brlz,pt		%o3, 11f
+	 nop
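+	/* ENCRYPT_256_2 clobbered %f56-%f62; reload those key doubles
+	 * before handling the final single block at 10:.
+	 */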
+	ldd		[%o0 + 0xd0], %f56
+	ldd		[%o0 + 0xd8], %f58
+	ldd		[%o0 + 0xe0], %f60
+	ldd		[%o0 + 0xe8], %f62
+10:	xor		%g1, %g3, %o5
+	MOVXTOD_O5_F0
+	xor		%g2, %g7, %o5
+	MOVXTOD_O5_F2
+	add		%g7, 1, %g7
+	add		%g3, 1, %o5
+	movrz		%g7, %o5, %g3
+	ENCRYPT_256(8, 0, 2, 4, 6)
+	ldd		[%o1 + 0x00], %f4
+	ldd		[%o1 + 0x08], %f6
+	fxor		%f4, %f0, %f4
+	fxor		%f6, %f2, %f6
+	std		%f4, [%o2 + 0x00]
+	std		%f6, [%o2 + 0x08]
+11:	stx		%g3, [%o4 + 0x00]
+	retl
+	 stx		%g7, [%o4 + 0x08]
+ENDPROC(aes_sparc64_ctr_crypt_256)
diff --git a/arch/sparc/crypto/aes_glue.c b/arch/sparc/crypto/aes_glue.c
new file mode 100644
index 000000000000..8f1c9980f637
--- /dev/null
+++ b/arch/sparc/crypto/aes_glue.c
@@ -0,0 +1,477 @@
+/* Glue code for AES encryption optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon arch/x86/crypto/aesni-intel_glue.c
+ *
+ * Copyright (C) 2008, Intel Corp.
+ *    Author: Huang Ying <ying.huang@intel.com>
+ *
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ *    Authors: Adrian Hoban <adrian.hoban@intel.com>
+ *             Gabriele Paoloni <gabriele.paoloni@intel.com>
+ *             Tadeusz Struk (tadeusz.struk@intel.com)
+ *             Aidan O'Mahony (aidan.o.mahony@intel.com)
+ *    Copyright (c) 2010, Intel Corporation.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+#include <crypto/aes.h>
+
+#include <asm/fpumacro.h>
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+struct aes_ops {
+	void (*encrypt)(const u64 *key, const u32 *input, u32 *output);
+	void (*decrypt)(const u64 *key, const u32 *input, u32 *output);
+	void (*load_encrypt_keys)(const u64 *key);
+	void (*load_decrypt_keys)(const u64 *key);
+	void (*ecb_encrypt)(const u64 *key, const u64 *input, u64 *output,
+			    unsigned int len);
+	void (*ecb_decrypt)(const u64 *key, const u64 *input, u64 *output,
+			    unsigned int len);
+	void (*cbc_encrypt)(const u64 *key, const u64 *input, u64 *output,
+			    unsigned int len, u64 *iv);
+	void (*cbc_decrypt)(const u64 *key, const u64 *input, u64 *output,
+			    unsigned int len, u64 *iv);
+	void (*ctr_crypt)(const u64 *key, const u64 *input, u64 *output,
+			  unsigned int len, u64 *iv);
+};
+
+struct crypto_sparc64_aes_ctx {
+	struct aes_ops *ops;
+	u64 key[AES_MAX_KEYLENGTH / sizeof(u64)];
+	u32 key_length;
+	u32 expanded_key_length;
+};
+
+extern void aes_sparc64_encrypt_128(const u64 *key, const u32 *input,
+				    u32 *output);
+extern void aes_sparc64_encrypt_192(const u64 *key, const u32 *input,
+				    u32 *output);
+extern void aes_sparc64_encrypt_256(const u64 *key, const u32 *input,
+				    u32 *output);
+
+extern void aes_sparc64_decrypt_128(const u64 *key, const u32 *input,
+				    u32 *output);
+extern void aes_sparc64_decrypt_192(const u64 *key, const u32 *input,
+				    u32 *output);
+extern void aes_sparc64_decrypt_256(const u64 *key, const u32 *input,
+				    u32 *output);
+
+extern void aes_sparc64_load_encrypt_keys_128(const u64 *key);
+extern void aes_sparc64_load_encrypt_keys_192(const u64 *key);
+extern void aes_sparc64_load_encrypt_keys_256(const u64 *key);
+
+extern void aes_sparc64_load_decrypt_keys_128(const u64 *key);
+extern void aes_sparc64_load_decrypt_keys_192(const u64 *key);
+extern void aes_sparc64_load_decrypt_keys_256(const u64 *key);
+
+extern void aes_sparc64_ecb_encrypt_128(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+extern void aes_sparc64_ecb_encrypt_192(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+extern void aes_sparc64_ecb_encrypt_256(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+
+extern void aes_sparc64_ecb_decrypt_128(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+extern void aes_sparc64_ecb_decrypt_192(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+extern void aes_sparc64_ecb_decrypt_256(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len);
+
+extern void aes_sparc64_cbc_encrypt_128(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_encrypt_192(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_encrypt_256(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_decrypt_128(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_decrypt_192(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_cbc_decrypt_256(const u64 *key, const u64 *input,
+					u64 *output, unsigned int len,
+					u64 *iv);
+
+extern void aes_sparc64_ctr_crypt_128(const u64 *key, const u64 *input,
+				      u64 *output, unsigned int len,
+				      u64 *iv);
+extern void aes_sparc64_ctr_crypt_192(const u64 *key, const u64 *input,
+				      u64 *output, unsigned int len,
+				      u64 *iv);
+extern void aes_sparc64_ctr_crypt_256(const u64 *key, const u64 *input,
+				      u64 *output, unsigned int len,
+				      u64 *iv);
+
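+/* One ops table per key size; aes_set_key() selects the matching
+ * table so the transform paths never re-check the key length.
+ */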
+struct aes_ops aes128_ops = {
+	.encrypt		= aes_sparc64_encrypt_128,
+	.decrypt		= aes_sparc64_decrypt_128,
+	.load_encrypt_keys	= aes_sparc64_load_encrypt_keys_128,
+	.load_decrypt_keys	= aes_sparc64_load_decrypt_keys_128,
+	.ecb_encrypt		= aes_sparc64_ecb_encrypt_128,
+	.ecb_decrypt		= aes_sparc64_ecb_decrypt_128,
+	.cbc_encrypt		= aes_sparc64_cbc_encrypt_128,
+	.cbc_decrypt		= aes_sparc64_cbc_decrypt_128,
+	.ctr_crypt		= aes_sparc64_ctr_crypt_128,
+};
+
+struct aes_ops aes192_ops = {
+	.encrypt		= aes_sparc64_encrypt_192,
+	.decrypt		= aes_sparc64_decrypt_192,
+	.load_encrypt_keys	= aes_sparc64_load_encrypt_keys_192,
+	.load_decrypt_keys	= aes_sparc64_load_decrypt_keys_192,
+	.ecb_encrypt		= aes_sparc64_ecb_encrypt_192,
+	.ecb_decrypt		= aes_sparc64_ecb_decrypt_192,
+	.cbc_encrypt		= aes_sparc64_cbc_encrypt_192,
+	.cbc_decrypt		= aes_sparc64_cbc_decrypt_192,
+	.ctr_crypt		= aes_sparc64_ctr_crypt_192,
+};
+
+struct aes_ops aes256_ops = {
+	.encrypt		= aes_sparc64_encrypt_256,
+	.decrypt		= aes_sparc64_decrypt_256,
+	.load_encrypt_keys	= aes_sparc64_load_encrypt_keys_256,
+	.load_decrypt_keys	= aes_sparc64_load_decrypt_keys_256,
+	.ecb_encrypt		= aes_sparc64_ecb_encrypt_256,
+	.ecb_decrypt		= aes_sparc64_ecb_decrypt_256,
+	.cbc_encrypt		= aes_sparc64_cbc_encrypt_256,
+	.cbc_decrypt		= aes_sparc64_cbc_decrypt_256,
+	.ctr_crypt		= aes_sparc64_ctr_crypt_256,
+};
+
+extern void aes_sparc64_key_expand(const u32 *in_key, u64 *output_key,
+				   unsigned int key_len);
+
+static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
+		       unsigned int key_len)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+
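+	/* The expanded schedule is (rounds + 1) 16-byte round keys:
+	 * 0xb0 bytes for AES-128 (11 keys), 0xd0 for AES-192 (13),
+	 * 0xf0 for AES-256 (15).
+	 */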
+	switch (key_len) {
+	case AES_KEYSIZE_128:
+		ctx->expanded_key_length = 0xb0;
+		ctx->ops = &aes128_ops;
+		break;
+
+	case AES_KEYSIZE_192:
+		ctx->expanded_key_length = 0xd0;
+		ctx->ops = &aes192_ops;
+		break;
+
+	case AES_KEYSIZE_256:
+		ctx->expanded_key_length = 0xf0;
+		ctx->ops = &aes256_ops;
+		break;
+
+	default:
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	aes_sparc64_key_expand((const u32 *)in_key, &ctx->key[0], key_len);
+	ctx->key_length = key_len;
+
+	return 0;
+}
+
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->ops->encrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst);
+}
+
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	ctx->ops->decrypt(&ctx->key[0], (const u32 *) src, (u32 *) dst);
+}
+
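+/* Round a byte count down to a whole number of AES blocks (16 bytes). */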
+#define AES_BLOCK_MASK	(~(AES_BLOCK_SIZE-1))
+
+static int ecb_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_encrypt_keys(&ctx->key[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->ecb_encrypt(&ctx->key[0],
+					      (const u64 *)walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr,
+					      block_len);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
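+	/* load_encrypt_keys() entered the FPU via VISEntry; clearing
+	 * FPRS here gives the register file back now that no live
+	 * crypto state remains in it.
+	 */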
+	fprs_write(0);
+	return err;
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	u64 *key_end;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_decrypt_keys(&ctx->key[0]);
+	key_end = &ctx->key[ctx->expanded_key_length / sizeof(u64)];
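+	/* The decrypt asm walks the schedule backwards, so it takes a
+	 * pointer just past the last expanded round key.
+	 */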
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->ecb_decrypt(key_end,
+					      (const u64 *) walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr, block_len);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+
+	return err;
+}
+
+static int cbc_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_encrypt_keys(&ctx->key[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->cbc_encrypt(&ctx->key[0],
+					      (const u64 *)walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr,
+					      block_len, (u64 *) walk.iv);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	u64 *key_end;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_decrypt_keys(&ctx->key[0]);
+	key_end = &ctx->key[ctx->expanded_key_length / sizeof(u64)];
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->cbc_decrypt(key_end,
+					      (const u64 *) walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr,
+					      block_len, (u64 *) walk.iv);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+
+	return err;
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc,
+		     struct scatterlist *dst, struct scatterlist *src,
+		     unsigned int nbytes)
+{
+	struct crypto_sparc64_aes_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	ctx->ops->load_encrypt_keys(&ctx->key[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & AES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			ctx->ops->ctr_crypt(&ctx->key[0],
+					    (const u64 *)walk.src.virt.addr,
+					    (u64 *) walk.dst.virt.addr,
+					    block_len, (u64 *) walk.iv);
+		}
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static struct crypto_alg algs[] = { {
+	.cra_name		= "aes",
+	.cra_driver_name	= "aes-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_sparc64_aes_ctx),
+	.cra_alignmask		= 3,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= AES_MIN_KEY_SIZE,
+			.cia_max_keysize	= AES_MAX_KEY_SIZE,
+			.cia_setkey		= aes_set_key,
+			.cia_encrypt		= aes_encrypt,
+			.cia_decrypt		= aes_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(aes)",
+	.cra_driver_name	= "ecb-aes-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_sparc64_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(aes)",
+	.cra_driver_name	= "cbc-aes-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_sparc64_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "ctr(aes)",
+	.cra_driver_name	= "ctr-aes-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= AES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct crypto_sparc64_aes_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+} };
+
+static bool __init sparc64_has_aes_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_AES))
+		return false;
+
+	return true;
+}
+
+static int __init aes_sparc64_mod_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++)
+		INIT_LIST_HEAD(&algs[i].cra_list);
+
+	if (sparc64_has_aes_opcode()) {
+		pr_info("Using sparc64 aes opcodes optimized AES implementation\n");
+		return crypto_register_algs(algs, ARRAY_SIZE(algs));
+	}
+	pr_info("sparc64 aes opcodes not available.\n");
+	return -ENODEV;
+}
+
+static void __exit aes_sparc64_mod_fini(void)
+{
+	crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+module_init(aes_sparc64_mod_init);
+module_exit(aes_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("AES Secure Hash Algorithm, sparc64 aes opcode accelerated");
+
+MODULE_ALIAS("aes");
diff --git a/arch/sparc/crypto/camellia_asm.S b/arch/sparc/crypto/camellia_asm.S
new file mode 100644
index 000000000000..cc39553a4e43
--- /dev/null
+++ b/arch/sparc/crypto/camellia_asm.S
@@ -0,0 +1,563 @@
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+#define CAMELLIA_6ROUNDS(KEY_BASE, I0, I1) \
+	CAMELLIA_F(KEY_BASE +  0, I1, I0, I1) \
+	CAMELLIA_F(KEY_BASE +  2, I0, I1, I0) \
+	CAMELLIA_F(KEY_BASE +  4, I1, I0, I1) \
+	CAMELLIA_F(KEY_BASE +  6, I0, I1, I0) \
+	CAMELLIA_F(KEY_BASE +  8, I1, I0, I1) \
+	CAMELLIA_F(KEY_BASE + 10, I0, I1, I0)
+
+#define CAMELLIA_6ROUNDS_FL_FLI(KEY_BASE, I0, I1) \
+	CAMELLIA_6ROUNDS(KEY_BASE, I0, I1) \
+	CAMELLIA_FL(KEY_BASE + 12, I0, I0) \
+	CAMELLIA_FLI(KEY_BASE + 14, I1, I1)
+
+	.data
+
+	.align	8
+SIGMA:	.xword	0xA09E667F3BCC908B
+	.xword	0xB67AE8584CAA73B2
+	.xword	0xC6EF372FE94F82BE
+	.xword	0x54FF53A5F1D36F1C
+	.xword	0x10E527FADE682D1D
+	.xword	0xB05688C2B3E6C1FD
+
+	.text
+
+	.align	32
+ENTRY(camellia_sparc64_key_expand)
+	/* %o0=in_key, %o1=encrypt_key, %o2=key_len, %o3=decrypt_key */
+	VISEntry
+	ld	[%o0 + 0x00], %f0	! i0, k[0]
+	ld	[%o0 + 0x04], %f1	! i1, k[1]
+	ld	[%o0 + 0x08], %f2	! i2, k[2]
+	ld	[%o0 + 0x0c], %f3	! i3, k[3]
+	std	%f0, [%o1 + 0x00]	! k[0, 1]
+	fsrc2	%f0, %f28
+	std	%f2, [%o1 + 0x08]	! k[2, 3]
+	cmp	%o2, 16
+	be	10f
+	 fsrc2	%f2, %f30
+
+	ld	[%o0 + 0x10], %f0
+	ld	[%o0 + 0x14], %f1
+	std	%f0, [%o1 + 0x20]	! k[8, 9]
+	cmp	%o2, 24
+	fone	%f10
+	be,a	1f
+	 fxor	%f10, %f0, %f2
+	ld	[%o0 + 0x18], %f2
+	ld	[%o0 + 0x1c], %f3
+1:
+	std	%f2, [%o1 + 0x28]	! k[10, 11]
+	fxor	%f28, %f0, %f0
+	fxor	%f30, %f2, %f2
+
+10:
+	sethi	%hi(SIGMA), %g3
+	or	%g3, %lo(SIGMA), %g3
+	ldd	[%g3 + 0x00], %f16
+	ldd	[%g3 + 0x08], %f18
+	ldd	[%g3 + 0x10], %f20
+	ldd	[%g3 + 0x18], %f22
+	ldd	[%g3 + 0x20], %f24
+	ldd	[%g3 + 0x28], %f26
+	CAMELLIA_F(16, 2, 0, 2)
+	CAMELLIA_F(18, 0, 2, 0)
+	fxor	%f28, %f0, %f0
+	fxor	%f30, %f2, %f2
+	CAMELLIA_F(20, 2, 0, 2)
+	CAMELLIA_F(22, 0, 2, 0)
+
+#define ROTL128(S01, S23, TMP1, TMP2, N)	\
+	srlx	S01, (64 - N), TMP1;		\
+	sllx	S01, N, S01;			\
+	srlx	S23, (64 - N), TMP2;		\
+	sllx	S23, N, S23;			\
+	or	S01, TMP2, S01;			\
+	or	S23, TMP1, S23
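+/* ROTL128 rotates the 128-bit value held in the register pair {S01,S23}
+ * left by N bits; every use below has 0 < N < 64.  A reference C model of
+ * the same operation (illustrative only, not built):
+ *
+ *	static inline void rotl128(u64 *s01, u64 *s23, unsigned int n)
+ *	{
+ *		u64 hi = (*s01 << n) | (*s23 >> (64 - n));
+ *		u64 lo = (*s23 << n) | (*s01 >> (64 - n));
+ *
+ *		*s01 = hi;
+ *		*s23 = lo;
+ *	}
+ */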
+
+	cmp	%o2, 16
+	bne	1f
+	 nop
+	/* 128-bit key */
+	std	%f0, [%o1 + 0x10]	! k[ 4,  5]
+	std	%f2, [%o1 + 0x18]	! k[ 6,  7]
+	MOVDTOX_F0_O4
+	MOVDTOX_F2_O5
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x30]	! k[12, 13]
+	stx	%o5, [%o1 + 0x38]	! k[14, 15]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x40]	! k[16, 17]
+	stx	%o5, [%o1 + 0x48]	! k[18, 19]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x60]	! k[24, 25]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x70]	! k[28, 29]
+	stx	%o5, [%o1 + 0x78]	! k[30, 31]
+	ROTL128(%o4, %o5, %g2, %g3, 34)
+	stx	%o4, [%o1 + 0xa0]	! k[40, 41]
+	stx	%o5, [%o1 + 0xa8]	! k[42, 43]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0xc0]	! k[48, 49]
+	stx	%o5, [%o1 + 0xc8]	! k[50, 51]
+
+	ldx	[%o1 + 0x00], %o4	! k[ 0,  1]
+	ldx	[%o1 + 0x08], %o5	! k[ 2,  3]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x20]	! k[ 8,  9]
+	stx	%o5, [%o1 + 0x28]	! k[10, 11]
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0x50]	! k[20, 21]
+	stx	%o5, [%o1 + 0x58]	! k[22, 23]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o5, [%o1 + 0x68]	! k[26, 27]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0x80]	! k[32, 33]
+	stx	%o5, [%o1 + 0x88]	! k[34, 35]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0x90]	! k[36, 37]
+	stx	%o5, [%o1 + 0x98]	! k[38, 39]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0xb0]	! k[44, 45]
+	stx	%o5, [%o1 + 0xb8]	! k[46, 47]
+
+	ba,pt	%xcc, 2f
+	 mov	(3 * 16 * 4), %o0
+
+1:
+	/* 192-bit or 256-bit key */
+	std	%f0, [%o1 + 0x30]	! k[12, 13]
+	std	%f2, [%o1 + 0x38]	! k[14, 15]
+	ldd	[%o1 + 0x20], %f4	! k[ 8,  9]
+	ldd	[%o1 + 0x28], %f6	! k[10, 11]
+	fxor	%f0, %f4, %f0
+	fxor	%f2, %f6, %f2
+	CAMELLIA_F(24, 2, 0, 2)
+	CAMELLIA_F(26, 0, 2, 0)
+	std	%f0, [%o1 + 0x10]	! k[ 4,  5]
+	std	%f2, [%o1 + 0x18]	! k[ 6,  7]
+	MOVDTOX_F0_O4
+	MOVDTOX_F2_O5
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0x50]	! k[20, 21]
+	stx	%o5, [%o1 + 0x58]	! k[22, 23]
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0xa0]	! k[40, 41]
+	stx	%o5, [%o1 + 0xa8]	! k[42, 43]
+	ROTL128(%o4, %o5, %g2, %g3, 51)
+	stx	%o4, [%o1 + 0x100]	! k[64, 65]
+	stx	%o5, [%o1 + 0x108]	! k[66, 67]
+	ldx	[%o1 + 0x20], %o4	! k[ 8,  9]
+	ldx	[%o1 + 0x28], %o5	! k[10, 11]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x20]	! k[ 8,  9]
+	stx	%o5, [%o1 + 0x28]	! k[10, 11]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x40]	! k[16, 17]
+	stx	%o5, [%o1 + 0x48]	! k[18, 19]
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0x90]	! k[36, 37]
+	stx	%o5, [%o1 + 0x98]	! k[38, 39]
+	ROTL128(%o4, %o5, %g2, %g3, 34)
+	stx	%o4, [%o1 + 0xd0]	! k[52, 53]
+	stx	%o5, [%o1 + 0xd8]	! k[54, 55]
+	ldx	[%o1 + 0x30], %o4	! k[12, 13]
+	ldx	[%o1 + 0x38], %o5	! k[14, 15]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x30]	! k[12, 13]
+	stx	%o5, [%o1 + 0x38]	! k[14, 15]
+	ROTL128(%o4, %o5, %g2, %g3, 30)
+	stx	%o4, [%o1 + 0x70]	! k[28, 29]
+	stx	%o5, [%o1 + 0x78]	! k[30, 31]
+	srlx	%o4, 32, %g2
+	srlx	%o5, 32, %g3
+	stw	%o4, [%o1 + 0xc0]	! k[48]
+	stw	%g3, [%o1 + 0xc4]	! k[49]
+	stw	%o5, [%o1 + 0xc8]	! k[50]
+	stw	%g2, [%o1 + 0xcc]	! k[51]
+	ROTL128(%o4, %o5, %g2, %g3, 49)
+	stx	%o4, [%o1 + 0xe0]	! k[56, 57]
+	stx	%o5, [%o1 + 0xe8]	! k[58, 59]
+	ldx	[%o1 + 0x00], %o4	! k[ 0,  1]
+	ldx	[%o1 + 0x08], %o5	! k[ 2,  3]
+	ROTL128(%o4, %o5, %g2, %g3, 45)
+	stx	%o4, [%o1 + 0x60]	! k[24, 25]
+	stx	%o5, [%o1 + 0x68]	! k[26, 27]
+	ROTL128(%o4, %o5, %g2, %g3, 15)
+	stx	%o4, [%o1 + 0x80]	! k[32, 33]
+	stx	%o5, [%o1 + 0x88]	! k[34, 35]
+	ROTL128(%o4, %o5, %g2, %g3, 17)
+	stx	%o4, [%o1 + 0xb0]	! k[44, 45]
+	stx	%o5, [%o1 + 0xb8]	! k[46, 47]
+	ROTL128(%o4, %o5, %g2, %g3, 34)
+	stx	%o4, [%o1 + 0xf0]	! k[60, 61]
+	stx	%o5, [%o1 + 0xf8]	! k[62, 63]
+	mov	(4 * 16 * 4), %o0
+2:
+	add	%o1, %o0, %o1
+	ldd	[%o1 + 0x00], %f0
+	ldd	[%o1 + 0x08], %f2
+	std	%f0, [%o3 + 0x00]
+	std	%f2, [%o3 + 0x08]
+	add	%o3, 0x10, %o3
+1:
+	sub	%o1, (16 * 4), %o1
+	ldd	[%o1 + 0x38], %f0
+	ldd	[%o1 + 0x30], %f2
+	ldd	[%o1 + 0x28], %f4
+	ldd	[%o1 + 0x20], %f6
+	ldd	[%o1 + 0x18], %f8
+	ldd	[%o1 + 0x10], %f10
+	std	%f0, [%o3 + 0x00]
+	std	%f2, [%o3 + 0x08]
+	std	%f4, [%o3 + 0x10]
+	std	%f6, [%o3 + 0x18]
+	std	%f8, [%o3 + 0x20]
+	std	%f10, [%o3 + 0x28]
+
+	ldd	[%o1 + 0x08], %f0
+	ldd	[%o1 + 0x00], %f2
+	std	%f0, [%o3 + 0x30]
+	std	%f2, [%o3 + 0x38]
+	subcc	%o0, (16 * 4), %o0
+	bne,pt	%icc, 1b
+	 add	%o3, (16 * 4), %o3
+
+	std	%f2, [%o3 - 0x10]
+	std	%f0, [%o3 - 0x08]
+
+	retl
+	 VISExit
+ENDPROC(camellia_sparc64_key_expand)
+
+	.align	32
+ENTRY(camellia_sparc64_crypt)
+	/* %o0=key, %o1=input, %o2=output, %o3=key_len */
+	VISEntry
+
+	ld	[%o1 + 0x00], %f0
+	ld	[%o1 + 0x04], %f1
+	ld	[%o1 + 0x08], %f2
+	ld	[%o1 + 0x0c], %f3
+
+	ldd	[%o0 + 0x00], %f4
+	ldd	[%o0 + 0x08], %f6
+
+	cmp	%o3, 16
+	fxor	%f4, %f0, %f0
+	be	1f
+	 fxor	%f6, %f2, %f2
+
+	ldd	[%o0 + 0x10], %f8
+	ldd	[%o0 + 0x18], %f10
+	ldd	[%o0 + 0x20], %f12
+	ldd	[%o0 + 0x28], %f14
+	ldd	[%o0 + 0x30], %f16
+	ldd	[%o0 + 0x38], %f18
+	ldd	[%o0 + 0x40], %f20
+	ldd	[%o0 + 0x48], %f22
+	add	%o0, 0x40, %o0
+
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+
+1:
+	ldd	[%o0 + 0x10], %f8
+	ldd	[%o0 + 0x18], %f10
+	ldd	[%o0 + 0x20], %f12
+	ldd	[%o0 + 0x28], %f14
+	ldd	[%o0 + 0x30], %f16
+	ldd	[%o0 + 0x38], %f18
+	ldd	[%o0 + 0x40], %f20
+	ldd	[%o0 + 0x48], %f22
+	ldd	[%o0 + 0x50], %f24
+	ldd	[%o0 + 0x58], %f26
+	ldd	[%o0 + 0x60], %f28
+	ldd	[%o0 + 0x68], %f30
+	ldd	[%o0 + 0x70], %f32
+	ldd	[%o0 + 0x78], %f34
+	ldd	[%o0 + 0x80], %f36
+	ldd	[%o0 + 0x88], %f38
+	ldd	[%o0 + 0x90], %f40
+	ldd	[%o0 + 0x98], %f42
+	ldd	[%o0 + 0xa0], %f44
+	ldd	[%o0 + 0xa8], %f46
+	ldd	[%o0 + 0xb0], %f48
+	ldd	[%o0 + 0xb8], %f50
+	ldd	[%o0 + 0xc0], %f52
+	ldd	[%o0 + 0xc8], %f54
+
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS(40, 0, 2)
+	fxor	%f52, %f2, %f2
+	fxor	%f54, %f0, %f0
+
+	st	%f2, [%o2 + 0x00]
+	st	%f3, [%o2 + 0x04]
+	st	%f0, [%o2 + 0x08]
+	st	%f1, [%o2 + 0x0c]
+
+	retl
+	 VISExit
+ENDPROC(camellia_sparc64_crypt)
+
+	.align	32
+ENTRY(camellia_sparc64_load_keys)
+	/* %o0=key, %o1=key_len */
+	VISEntry
+	ldd	[%o0 + 0x00], %f4
+	ldd	[%o0 + 0x08], %f6
+	ldd	[%o0 + 0x10], %f8
+	ldd	[%o0 + 0x18], %f10
+	ldd	[%o0 + 0x20], %f12
+	ldd	[%o0 + 0x28], %f14
+	ldd	[%o0 + 0x30], %f16
+	ldd	[%o0 + 0x38], %f18
+	ldd	[%o0 + 0x40], %f20
+	ldd	[%o0 + 0x48], %f22
+	ldd	[%o0 + 0x50], %f24
+	ldd	[%o0 + 0x58], %f26
+	ldd	[%o0 + 0x60], %f28
+	ldd	[%o0 + 0x68], %f30
+	ldd	[%o0 + 0x70], %f32
+	ldd	[%o0 + 0x78], %f34
+	ldd	[%o0 + 0x80], %f36
+	ldd	[%o0 + 0x88], %f38
+	ldd	[%o0 + 0x90], %f40
+	ldd	[%o0 + 0x98], %f42
+	ldd	[%o0 + 0xa0], %f44
+	ldd	[%o0 + 0xa8], %f46
+	ldd	[%o0 + 0xb0], %f48
+	ldd	[%o0 + 0xb8], %f50
+	ldd	[%o0 + 0xc0], %f52
+	retl
+	 ldd	[%o0 + 0xc8], %f54
+ENDPROC(camellia_sparc64_load_keys)
+
+	.align	32
+ENTRY(camellia_sparc64_ecb_crypt_3_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key */
+1:	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	add	%o0, 0x10, %o0
+	fxor	%f4, %f0, %f0
+	fxor	%f6, %f2, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS(40, 0, 2)
+	fxor	%f52, %f2, %f2
+	fxor	%f54, %f0, %f0
+	std	%f2, [%o1 + 0x00]
+	std	%f0, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	retl
+	 nop
+ENDPROC(camellia_sparc64_ecb_crypt_3_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_ecb_crypt_4_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key */
+1:	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	add	%o0, 0x10, %o0
+	fxor	%f4, %f0, %f0
+	fxor	%f6, %f2, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	ldd	[%o3 + 0xd0], %f8
+	ldd	[%o3 + 0xd8], %f10
+	ldd	[%o3 + 0xe0], %f12
+	ldd	[%o3 + 0xe8], %f14
+	ldd	[%o3 + 0xf0], %f16
+	ldd	[%o3 + 0xf8], %f18
+	ldd	[%o3 + 0x100], %f20
+	ldd	[%o3 + 0x108], %f22
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2)
+	CAMELLIA_F(8, 2, 0, 2)
+	CAMELLIA_F(10, 0, 2, 0)
+	ldd	[%o3 + 0x10], %f8
+	ldd	[%o3 + 0x18], %f10
+	CAMELLIA_F(12, 2, 0, 2)
+	CAMELLIA_F(14, 0, 2, 0)
+	ldd	[%o3 + 0x20], %f12
+	ldd	[%o3 + 0x28], %f14
+	CAMELLIA_F(16, 2, 0, 2)
+	CAMELLIA_F(18, 0, 2, 0)
+	ldd	[%o3 + 0x30], %f16
+	ldd	[%o3 + 0x38], %f18
+	fxor	%f20, %f2, %f2
+	fxor	%f22, %f0, %f0
+	ldd	[%o3 + 0x40], %f20
+	ldd	[%o3 + 0x48], %f22
+	std	%f2, [%o1 + 0x00]
+	std	%f0, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	retl
+	 nop
+ENDPROC(camellia_sparc64_ecb_crypt_4_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_cbc_encrypt_3_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+	ldd	[%o4 + 0x08], %f62
+1:	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	add	%o0, 0x10, %o0
+	fxor	%f60, %f0, %f0
+	fxor	%f62, %f2, %f2
+	fxor	%f4, %f0, %f0
+	fxor	%f6, %f2, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS(40, 0, 2)
+	fxor	%f52, %f2, %f60
+	fxor	%f54, %f0, %f62
+	std	%f60, [%o1 + 0x00]
+	std	%f62, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	std	%f60, [%o4 + 0x00]
+	retl
+	 std	%f62, [%o4 + 0x08]
+ENDPROC(camellia_sparc64_cbc_encrypt_3_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_cbc_encrypt_4_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+	ldd	[%o4 + 0x08], %f62
+1:	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	add	%o0, 0x10, %o0
+	fxor	%f60, %f0, %f0
+	fxor	%f62, %f2, %f2
+	fxor	%f4, %f0, %f0
+	fxor	%f6, %f2, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	ldd	[%o3 + 0xd0], %f8
+	ldd	[%o3 + 0xd8], %f10
+	ldd	[%o3 + 0xe0], %f12
+	ldd	[%o3 + 0xe8], %f14
+	ldd	[%o3 + 0xf0], %f16
+	ldd	[%o3 + 0xf8], %f18
+	ldd	[%o3 + 0x100], %f20
+	ldd	[%o3 + 0x108], %f22
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2)
+	CAMELLIA_F(8, 2, 0, 2)
+	CAMELLIA_F(10, 0, 2, 0)
+	ldd	[%o3 + 0x10], %f8
+	ldd	[%o3 + 0x18], %f10
+	CAMELLIA_F(12, 2, 0, 2)
+	CAMELLIA_F(14, 0, 2, 0)
+	ldd	[%o3 + 0x20], %f12
+	ldd	[%o3 + 0x28], %f14
+	CAMELLIA_F(16, 2, 0, 2)
+	CAMELLIA_F(18, 0, 2, 0)
+	ldd	[%o3 + 0x30], %f16
+	ldd	[%o3 + 0x38], %f18
+	fxor	%f20, %f2, %f60
+	fxor	%f22, %f0, %f62
+	ldd	[%o3 + 0x40], %f20
+	ldd	[%o3 + 0x48], %f22
+	std	%f60, [%o1 + 0x00]
+	std	%f62, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	std	%f60, [%o4 + 0x00]
+	retl
+	 std	%f62, [%o4 + 0x08]
+ENDPROC(camellia_sparc64_cbc_encrypt_4_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_cbc_decrypt_3_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+	ldd	[%o4 + 0x08], %f62
+1:	ldd	[%o0 + 0x00], %f56
+	ldd	[%o0 + 0x08], %f58
+	add	%o0, 0x10, %o0
+	fxor	%f4, %f56, %f0
+	fxor	%f6, %f58, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS(40, 0, 2)
+	fxor	%f52, %f2, %f2
+	fxor	%f54, %f0, %f0
+	fxor	%f60, %f2, %f2
+	fxor	%f62, %f0, %f0
+	fsrc2	%f56, %f60
+	fsrc2	%f58, %f62
+	std	%f2, [%o1 + 0x00]
+	std	%f0, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	std	%f60, [%o4 + 0x00]
+	retl
+	 std	%f62, [%o4 + 0x08]
+ENDPROC(camellia_sparc64_cbc_decrypt_3_grand_rounds)
+
+	.align	32
+ENTRY(camellia_sparc64_cbc_decrypt_4_grand_rounds)
+	/* %o0=input, %o1=output, %o2=len, %o3=key, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+	ldd	[%o4 + 0x08], %f62
+1:	ldd	[%o0 + 0x00], %f56
+	ldd	[%o0 + 0x08], %f58
+	add	%o0, 0x10, %o0
+	fxor	%f4, %f56, %f0
+	fxor	%f6, %f58, %f2
+	CAMELLIA_6ROUNDS_FL_FLI( 8, 0, 2)
+	ldd	[%o3 + 0xd0], %f8
+	ldd	[%o3 + 0xd8], %f10
+	ldd	[%o3 + 0xe0], %f12
+	ldd	[%o3 + 0xe8], %f14
+	ldd	[%o3 + 0xf0], %f16
+	ldd	[%o3 + 0xf8], %f18
+	ldd	[%o3 + 0x100], %f20
+	ldd	[%o3 + 0x108], %f22
+	CAMELLIA_6ROUNDS_FL_FLI(24, 0, 2)
+	CAMELLIA_6ROUNDS_FL_FLI(40, 0, 2)
+	CAMELLIA_F(8, 2, 0, 2)
+	CAMELLIA_F(10, 0, 2, 0)
+	ldd	[%o3 + 0x10], %f8
+	ldd	[%o3 + 0x18], %f10
+	CAMELLIA_F(12, 2, 0, 2)
+	CAMELLIA_F(14, 0, 2, 0)
+	ldd	[%o3 + 0x20], %f12
+	ldd	[%o3 + 0x28], %f14
+	CAMELLIA_F(16, 2, 0, 2)
+	CAMELLIA_F(18, 0, 2, 0)
+	ldd	[%o3 + 0x30], %f16
+	ldd	[%o3 + 0x38], %f18
+	fxor	%f20, %f2, %f2
+	fxor	%f22, %f0, %f0
+	ldd	[%o3 + 0x40], %f20
+	ldd	[%o3 + 0x48], %f22
+	fxor	%f60, %f2, %f2
+	fxor	%f62, %f0, %f0
+	fsrc2	%f56, %f60
+	fsrc2	%f58, %f62
+	std	%f2, [%o1 + 0x00]
+	std	%f0, [%o1 + 0x08]
+	subcc	%o2, 0x10, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x10, %o1
+	std	%f60, [%o4 + 0x00]
+	retl
+	 std	%f62, [%o4 + 0x08]
+ENDPROC(camellia_sparc64_cbc_decrypt_4_grand_rounds)
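
A note on the entry points above: Camellia with a 128-bit key runs 18
F-rounds in three six-round groups ("grand rounds") separated by FL/FLI
layers, while the 192- and 256-bit variants run 24 rounds in four groups.
Hence the paired _3_grand_rounds/_4_grand_rounds ECB and CBC routines; the
glue code below picks one or the other based on the key length.
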
diff --git a/arch/sparc/crypto/camellia_glue.c b/arch/sparc/crypto/camellia_glue.c
new file mode 100644
index 000000000000..42905c084299
--- /dev/null
+++ b/arch/sparc/crypto/camellia_glue.c
@@ -0,0 +1,322 @@
+/* Glue code for CAMELLIA encryption optimized for sparc64 crypto opcodes.
+ *
+ * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+
+#include <asm/fpumacro.h>
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+#define CAMELLIA_MIN_KEY_SIZE        16
+#define CAMELLIA_MAX_KEY_SIZE        32
+#define CAMELLIA_BLOCK_SIZE          16
+#define CAMELLIA_TABLE_BYTE_LEN     272
+
+struct camellia_sparc64_ctx {
+	u64 encrypt_key[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+	u64 decrypt_key[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
+	int key_len;
+};
+
+extern void camellia_sparc64_key_expand(const u32 *in_key, u64 *encrypt_key,
+					unsigned int key_len, u64 *decrypt_key);
+
+static int camellia_set_key(struct crypto_tfm *tfm, const u8 *_in_key,
+			    unsigned int key_len)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u32 *in_key = (const u32 *) _in_key;
+	u32 *flags = &tfm->crt_flags;
+
+	if (key_len != 16 && key_len != 24 && key_len != 32) {
+		*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+		return -EINVAL;
+	}
+
+	ctx->key_len = key_len;
+
+	camellia_sparc64_key_expand(in_key, &ctx->encrypt_key[0],
+				    key_len, &ctx->decrypt_key[0]);
+	return 0;
+}
+
+extern void camellia_sparc64_crypt(const u64 *key, const u32 *input,
+				   u32 *output, unsigned int key_len);
+
+static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	camellia_sparc64_crypt(&ctx->encrypt_key[0],
+			       (const u32 *) src,
+			       (u32 *) dst, ctx->key_len);
+}
+
+static void camellia_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+
+	camellia_sparc64_crypt(&ctx->decrypt_key[0],
+			       (const u32 *) src,
+			       (u32 *) dst, ctx->key_len);
+}
+
+extern void camellia_sparc64_load_keys(const u64 *key, unsigned int key_len);
+
+typedef void ecb_crypt_op(const u64 *input, u64 *output, unsigned int len,
+			  const u64 *key);
+
+extern ecb_crypt_op camellia_sparc64_ecb_crypt_3_grand_rounds;
+extern ecb_crypt_op camellia_sparc64_ecb_crypt_4_grand_rounds;
+
+#define CAMELLIA_BLOCK_MASK	(~(CAMELLIA_BLOCK_SIZE - 1))
+
+static int __ecb_crypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes, bool encrypt)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	ecb_crypt_op *op;
+	const u64 *key;
+	int err;
+
+	op = camellia_sparc64_ecb_crypt_3_grand_rounds;
+	if (ctx->key_len != 16)
+		op = camellia_sparc64_ecb_crypt_4_grand_rounds;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	if (encrypt)
+		key = &ctx->encrypt_key[0];
+	else
+		key = &ctx->decrypt_key[0];
+	camellia_sparc64_load_keys(key, ctx->key_len);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64;
+			u64 *dst64;
+
+			src64 = (const u64 *)walk.src.virt.addr;
+			dst64 = (u64 *) walk.dst.virt.addr;
+			op(src64, dst64, block_len, key);
+		}
+		nbytes &= CAMELLIA_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb_crypt(desc, dst, src, nbytes, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb_crypt(desc, dst, src, nbytes, false);
+}
+
+typedef void cbc_crypt_op(const u64 *input, u64 *output, unsigned int len,
+			  const u64 *key, u64 *iv);
+
+extern cbc_crypt_op camellia_sparc64_cbc_encrypt_3_grand_rounds;
+extern cbc_crypt_op camellia_sparc64_cbc_encrypt_4_grand_rounds;
+extern cbc_crypt_op camellia_sparc64_cbc_decrypt_3_grand_rounds;
+extern cbc_crypt_op camellia_sparc64_cbc_decrypt_4_grand_rounds;
+
+static int cbc_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	cbc_crypt_op *op;
+	const u64 *key;
+	int err;
+
+	op = camellia_sparc64_cbc_encrypt_3_grand_rounds;
+	if (ctx->key_len != 16)
+		op = camellia_sparc64_cbc_encrypt_4_grand_rounds;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	key = &ctx->encrypt_key[0];
+	camellia_sparc64_load_keys(key, ctx->key_len);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64;
+			u64 *dst64;
+
+			src64 = (const u64 *)walk.src.virt.addr;
+			dst64 = (u64 *) walk.dst.virt.addr;
+			op(src64, dst64, block_len, key,
+			   (u64 *) walk.iv);
+		}
+		nbytes &= CAMELLIA_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int cbc_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct camellia_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	cbc_crypt_op *op;
+	const u64 *key;
+	int err;
+
+	op = camellia_sparc64_cbc_decrypt_3_grand_rounds;
+	if (ctx->key_len != 16)
+		op = camellia_sparc64_cbc_decrypt_4_grand_rounds;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	key = &ctx->decrypt_key[0];
+	camellia_sparc64_load_keys(key, ctx->key_len);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & CAMELLIA_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64;
+			u64 *dst64;
+
+			src64 = (const u64 *)walk.src.virt.addr;
+			dst64 = (u64 *) walk.dst.virt.addr;
+			op(src64, dst64, block_len, key,
+			   (u64 *) walk.iv);
+		}
+		nbytes &= CAMELLIA_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static struct crypto_alg algs[] = { {
+	.cra_name		= "camellia",
+	.cra_driver_name	= "camellia-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_sparc64_ctx),
+	.cra_alignmask		= 3,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.cia_max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.cia_setkey		= camellia_set_key,
+			.cia_encrypt		= camellia_encrypt,
+			.cia_decrypt		= camellia_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(camellia)",
+	.cra_driver_name	= "ecb-camellia-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_set_key,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(camellia)",
+	.cra_driver_name	= "cbc-camellia-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct camellia_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
+			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
+			.setkey		= camellia_set_key,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+} };
+
+static bool __init sparc64_has_camellia_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_CAMELLIA))
+		return false;
+
+	return true;
+}
+
+static int __init camellia_sparc64_mod_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++)
+		INIT_LIST_HEAD(&algs[i].cra_list);
+
+	if (sparc64_has_camellia_opcode()) {
+		pr_info("Using sparc64 camellia opcodes optimized CAMELLIA implementation\n");
+		return crypto_register_algs(algs, ARRAY_SIZE(algs));
+	}
+	pr_info("sparc64 camellia opcodes not available.\n");
+	return -ENODEV;
+}
+
+static void __exit camellia_sparc64_mod_fini(void)
+{
+	crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+module_init(camellia_sparc64_mod_init);
+module_exit(camellia_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Camellia Cipher Algorithm, sparc64 camellia opcode accelerated");
+
+MODULE_ALIAS("aes");
diff --git a/arch/sparc/crypto/crc32c_asm.S b/arch/sparc/crypto/crc32c_asm.S
new file mode 100644
index 000000000000..2b1976e765b5
--- /dev/null
+++ b/arch/sparc/crypto/crc32c_asm.S
@@ -0,0 +1,20 @@
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+#include <asm/asi.h>
+
+#include "opcodes.h"
+
+ENTRY(crc32c_sparc64)
+	/* %o0=crc32p, %o1=data_ptr, %o2=len */
+	VISEntryHalf
+	lda	[%o0] ASI_PL, %f1
+1:	ldd	[%o1], %f2
+	CRC32C(0,2,0)
+	subcc	%o2, 8, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x8, %o1
+	sta	%f1, [%o0] ASI_PL
+	VISExitHalf
+2:	retl
+	 nop
+ENDPROC(crc32c_sparc64)
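
The loop above consumes exactly eight bytes per iteration and only tests the
remaining count for zero, so callers must pass a nonzero length that is a
multiple of 8. The glue code below masks the length with ~7U, skips the call
when no aligned data remains, and finishes any tail bytes with the generic
software CRC.
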
diff --git a/arch/sparc/crypto/crc32c_glue.c b/arch/sparc/crypto/crc32c_glue.c
new file mode 100644
index 000000000000..0bd89cea8d8e
--- /dev/null
+++ b/arch/sparc/crypto/crc32c_glue.c
@@ -0,0 +1,179 @@
+/* Glue code for CRC32C optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon arch/x86/crypto/crc32c-intel.c
+ *
+ * Copyright (C) 2008 Intel Corporation
+ * Authors: Austin Zhang <austin_zhang@linux.intel.com>
+ *          Kent Liu <kent.liu@intel.com>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/crc32.h>
+
+#include <crypto/internal/hash.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+/*
+ * Setting the seed allows arbitrary accumulators and flexible XOR policy.
+ * If your algorithm starts with ~0, then XOR with ~0 before you set
+ * the seed.
+ */
+static int crc32c_sparc64_setkey(struct crypto_shash *hash, const u8 *key,
+				 unsigned int keylen)
+{
+	u32 *mctx = crypto_shash_ctx(hash);
+
+	if (keylen != sizeof(u32)) {
+		crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+		return -EINVAL;
+	}
+	*mctx = le32_to_cpup((const __le32 *)key);
+	return 0;
+}
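+
+/*
+ * An illustration of the seed convention above (a sketch, not built): a
+ * finalized crc32c value is the bit-complement of the internal accumulator,
+ * so to resume from a previously finalized value 'prev', complement it
+ * before handing it to setkey:
+ *
+ *	__le32 seed = cpu_to_le32(~prev);
+ *	err = crypto_shash_setkey(tfm, (const u8 *)&seed, sizeof(seed));
+ */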
+
+static int crc32c_sparc64_init(struct shash_desc *desc)
+{
+	u32 *mctx = crypto_shash_ctx(desc->tfm);
+	u32 *crcp = shash_desc_ctx(desc);
+
+	*crcp = *mctx;
+
+	return 0;
+}
+
+extern void crc32c_sparc64(u32 *crcp, const u64 *data, unsigned int len);
+
+static void crc32c_compute(u32 *crcp, const u64 *data, unsigned int len)
+{
+	unsigned int asm_len;
+
+	asm_len = len & ~7U;
+	if (asm_len) {
+		crc32c_sparc64(crcp, data, asm_len);
+		data += asm_len / 8;
+		len -= asm_len;
+	}
+	if (len)
+		*crcp = __crc32c_le(*crcp, (const unsigned char *) data, len);
+}
+
+static int crc32c_sparc64_update(struct shash_desc *desc, const u8 *data,
+				 unsigned int len)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+
+	crc32c_compute(crcp, (const u64 *) data, len);
+
+	return 0;
+}
+
+static int __crc32c_sparc64_finup(u32 *crcp, const u8 *data, unsigned int len,
+				  u8 *out)
+{
+	u32 tmp = *crcp;
+
+	crc32c_compute(&tmp, (const u64 *) data, len);
+
+	*(__le32 *) out = ~cpu_to_le32(tmp);
+	return 0;
+}
+
+static int crc32c_sparc64_finup(struct shash_desc *desc, const u8 *data,
+				unsigned int len, u8 *out)
+{
+	return __crc32c_sparc64_finup(shash_desc_ctx(desc), data, len, out);
+}
+
+static int crc32c_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	u32 *crcp = shash_desc_ctx(desc);
+
+	*(__le32 *) out = ~cpu_to_le32p(crcp);
+	return 0;
+}
+
+static int crc32c_sparc64_digest(struct shash_desc *desc, const u8 *data,
+				 unsigned int len, u8 *out)
+{
+	return __crc32c_sparc64_finup(crypto_shash_ctx(desc->tfm), data, len,
+				      out);
+}
+
+static int crc32c_sparc64_cra_init(struct crypto_tfm *tfm)
+{
+	u32 *key = crypto_tfm_ctx(tfm);
+
+	*key = ~0;
+
+	return 0;
+}
+
+#define CHKSUM_BLOCK_SIZE	1
+#define CHKSUM_DIGEST_SIZE	4
+
+static struct shash_alg alg = {
+	.setkey			=	crc32c_sparc64_setkey,
+	.init			=	crc32c_sparc64_init,
+	.update			=	crc32c_sparc64_update,
+	.final			=	crc32c_sparc64_final,
+	.finup			=	crc32c_sparc64_finup,
+	.digest			=	crc32c_sparc64_digest,
+	.descsize		=	sizeof(u32),
+	.digestsize		=	CHKSUM_DIGEST_SIZE,
+	.base			=	{
+		.cra_name		=	"crc32c",
+		.cra_driver_name	=	"crc32c-sparc64",
+		.cra_priority		=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_blocksize		=	CHKSUM_BLOCK_SIZE,
+		.cra_ctxsize		=	sizeof(u32),
+		.cra_alignmask		=	7,
+		.cra_module		=	THIS_MODULE,
+		.cra_init		=	crc32c_sparc64_cra_init,
+	}
+};
+
+static bool __init sparc64_has_crc32c_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_CRC32C))
+		return false;
+
+	return true;
+}
+
+static int __init crc32c_sparc64_mod_init(void)
+{
+	if (sparc64_has_crc32c_opcode()) {
+		pr_info("Using sparc64 crc32c opcode optimized CRC32C implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("sparc64 crc32c opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit crc32c_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(crc32c_sparc64_mod_init);
+module_exit(crc32c_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("CRC32c (Castagnoli), sparc64 crc32c opcode accelerated");
+
+MODULE_ALIAS("crc32c");
diff --git a/arch/sparc/crypto/crop_devid.c b/arch/sparc/crypto/crop_devid.c
new file mode 100644
index 000000000000..5f5724a0ae22
--- /dev/null
+++ b/arch/sparc/crypto/crop_devid.c
@@ -0,0 +1,14 @@
+#include <linux/module.h>
+#include <linux/of_device.h>
+
+/* This is a dummy device table linked into all of the crypto
+ * opcode drivers.  It serves to trigger the module autoloading
+ * mechanisms in userspace which scan the OF device tree and
+ * load any modules which have device table entries that
+ * match OF device nodes.
+ */
+static const struct of_device_id crypto_opcode_match[] = {
+	{ .name = "cpu", .compatible = "sun4v", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, crypto_opcode_match);
diff --git a/arch/sparc/crypto/des_asm.S b/arch/sparc/crypto/des_asm.S
new file mode 100644
index 000000000000..30b6e90b28b2
--- /dev/null
+++ b/arch/sparc/crypto/des_asm.S
@@ -0,0 +1,418 @@
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+	.align	32
+ENTRY(des_sparc64_key_expand)
+	/* %o0=input_key, %o1=output_key */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	DES_KEXPAND(0, 0, 0)
+	DES_KEXPAND(0, 1, 2)
+	DES_KEXPAND(2, 3, 6)
+	DES_KEXPAND(2, 2, 4)
+	DES_KEXPAND(6, 3, 10)
+	DES_KEXPAND(6, 2, 8)
+	DES_KEXPAND(10, 3, 14)
+	DES_KEXPAND(10, 2, 12)
+	DES_KEXPAND(14, 1, 16)
+	DES_KEXPAND(16, 3, 20)
+	DES_KEXPAND(16, 2, 18)
+	DES_KEXPAND(20, 3, 24)
+	DES_KEXPAND(20, 2, 22)
+	DES_KEXPAND(24, 3, 28)
+	DES_KEXPAND(24, 2, 26)
+	DES_KEXPAND(28, 1, 30)
+	std	%f0, [%o1 + 0x00]
+	std	%f2, [%o1 + 0x08]
+	std	%f4, [%o1 + 0x10]
+	std	%f6, [%o1 + 0x18]
+	std	%f8, [%o1 + 0x20]
+	std	%f10, [%o1 + 0x28]
+	std	%f12, [%o1 + 0x30]
+	std	%f14, [%o1 + 0x38]
+	std	%f16, [%o1 + 0x40]
+	std	%f18, [%o1 + 0x48]
+	std	%f20, [%o1 + 0x50]
+	std	%f22, [%o1 + 0x58]
+	std	%f24, [%o1 + 0x60]
+	std	%f26, [%o1 + 0x68]
+	std	%f28, [%o1 + 0x70]
+	std	%f30, [%o1 + 0x78]
+	retl
+	 VISExitHalf
+ENDPROC(des_sparc64_key_expand)
+
+	.align	32
+ENTRY(des_sparc64_crypt)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ldd	[%o1 + 0x00], %f32
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	ldd	[%o0 + 0x30], %f12
+	ldd	[%o0 + 0x38], %f14
+	ldd	[%o0 + 0x40], %f16
+	ldd	[%o0 + 0x48], %f18
+	ldd	[%o0 + 0x50], %f20
+	ldd	[%o0 + 0x58], %f22
+	ldd	[%o0 + 0x60], %f24
+	ldd	[%o0 + 0x68], %f26
+	ldd	[%o0 + 0x70], %f28
+	ldd	[%o0 + 0x78], %f30
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+	std	%f32, [%o2 + 0x00]
+	retl
+	 VISExit
+ENDPROC(des_sparc64_crypt)
+
+	.align	32
+ENTRY(des_sparc64_load_keys)
+	/* %o0=key */
+	VISEntry
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	ldd	[%o0 + 0x30], %f12
+	ldd	[%o0 + 0x38], %f14
+	ldd	[%o0 + 0x40], %f16
+	ldd	[%o0 + 0x48], %f18
+	ldd	[%o0 + 0x50], %f20
+	ldd	[%o0 + 0x58], %f22
+	ldd	[%o0 + 0x60], %f24
+	ldd	[%o0 + 0x68], %f26
+	ldd	[%o0 + 0x70], %f28
+	retl
+	 ldd	[%o0 + 0x78], %f30
+ENDPROC(des_sparc64_load_keys)
+
+	.align	32
+ENTRY(des_sparc64_ecb_crypt)
+	/* %o0=input, %o1=output, %o2=len */
+1:	ldd	[%o0 + 0x00], %f32
+	add	%o0, 0x08, %o0
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+	std	%f32, [%o1 + 0x00]
+	subcc	%o2, 0x08, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x08, %o1
+	retl
+	 nop
+ENDPROC(des_sparc64_ecb_crypt)
+
+	.align	32
+ENTRY(des_sparc64_cbc_encrypt)
+	/* %o0=input, %o1=output, %o2=len, %o3=IV */
+	ldd	[%o3 + 0x00], %f32
+1:	ldd	[%o0 + 0x00], %f34
+	fxor	%f32, %f34, %f32
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+	std	%f32, [%o1 + 0x00]
+	add	%o0, 0x08, %o0
+	subcc	%o2, 0x08, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x08, %o1
+	retl
+	 std	%f32, [%o3 + 0x00]
+ENDPROC(des_sparc64_cbc_encrypt)
+
+	.align	32
+ENTRY(des_sparc64_cbc_decrypt)
+	/* %o0=input, %o1=output, %o2=len, %o3=IV */
+	ldd	[%o3 + 0x00], %f34
+1:	ldd	[%o0 + 0x00], %f36
+	DES_IP(36, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+	fxor	%f32, %f34, %f32
+	fsrc2	%f36, %f34
+	std	%f32, [%o1 + 0x00]
+	add	%o0, 0x08, %o0
+	subcc	%o2, 0x08, %o2
+	bne,pt	%icc, 1b
+	 add	%o1, 0x08, %o1
+	retl
+	 std	%f36, [%o3 + 0x00]
+ENDPROC(des_sparc64_cbc_decrypt)
+
+	.align	32
+ENTRY(des3_ede_sparc64_crypt)
+	/* %o0=key, %o1=input, %o2=output */
+	VISEntry
+	ldd	[%o1 + 0x00], %f32
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	ldd	[%o0 + 0x30], %f12
+	ldd	[%o0 + 0x38], %f14
+	ldd	[%o0 + 0x40], %f16
+	ldd	[%o0 + 0x48], %f18
+	ldd	[%o0 + 0x50], %f20
+	ldd	[%o0 + 0x58], %f22
+	ldd	[%o0 + 0x60], %f24
+	ldd	[%o0 + 0x68], %f26
+	ldd	[%o0 + 0x70], %f28
+	ldd	[%o0 + 0x78], %f30
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	ldd	[%o0 + 0x80], %f0
+	ldd	[%o0 + 0x88], %f2
+	DES_ROUND(4, 6, 32, 32)
+	ldd	[%o0 + 0x90], %f4
+	ldd	[%o0 + 0x98], %f6
+	DES_ROUND(8, 10, 32, 32)
+	ldd	[%o0 + 0xa0], %f8
+	ldd	[%o0 + 0xa8], %f10
+	DES_ROUND(12, 14, 32, 32)
+	ldd	[%o0 + 0xb0], %f12
+	ldd	[%o0 + 0xb8], %f14
+	DES_ROUND(16, 18, 32, 32)
+	ldd	[%o0 + 0xc0], %f16
+	ldd	[%o0 + 0xc8], %f18
+	DES_ROUND(20, 22, 32, 32)
+	ldd	[%o0 + 0xd0], %f20
+	ldd	[%o0 + 0xd8], %f22
+	DES_ROUND(24, 26, 32, 32)
+	ldd	[%o0 + 0xe0], %f24
+	ldd	[%o0 + 0xe8], %f26
+	DES_ROUND(28, 30, 32, 32)
+	ldd	[%o0 + 0xf0], %f28
+	ldd	[%o0 + 0xf8], %f30
+	DES_IIP(32, 32)
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	ldd	[%o0 + 0x100], %f0
+	ldd	[%o0 + 0x108], %f2
+	DES_ROUND(4, 6, 32, 32)
+	ldd	[%o0 + 0x110], %f4
+	ldd	[%o0 + 0x118], %f6
+	DES_ROUND(8, 10, 32, 32)
+	ldd	[%o0 + 0x120], %f8
+	ldd	[%o0 + 0x128], %f10
+	DES_ROUND(12, 14, 32, 32)
+	ldd	[%o0 + 0x130], %f12
+	ldd	[%o0 + 0x138], %f14
+	DES_ROUND(16, 18, 32, 32)
+	ldd	[%o0 + 0x140], %f16
+	ldd	[%o0 + 0x148], %f18
+	DES_ROUND(20, 22, 32, 32)
+	ldd	[%o0 + 0x150], %f20
+	ldd	[%o0 + 0x158], %f22
+	DES_ROUND(24, 26, 32, 32)
+	ldd	[%o0 + 0x160], %f24
+	ldd	[%o0 + 0x168], %f26
+	DES_ROUND(28, 30, 32, 32)
+	ldd	[%o0 + 0x170], %f28
+	ldd	[%o0 + 0x178], %f30
+	DES_IIP(32, 32)
+	DES_IP(32, 32)
+	DES_ROUND(0, 2, 32, 32)
+	DES_ROUND(4, 6, 32, 32)
+	DES_ROUND(8, 10, 32, 32)
+	DES_ROUND(12, 14, 32, 32)
+	DES_ROUND(16, 18, 32, 32)
+	DES_ROUND(20, 22, 32, 32)
+	DES_ROUND(24, 26, 32, 32)
+	DES_ROUND(28, 30, 32, 32)
+	DES_IIP(32, 32)
+
+	std	%f32, [%o2 + 0x00]
+	retl
+	 VISExit
+ENDPROC(des3_ede_sparc64_crypt)
+
+	.align	32
+ENTRY(des3_ede_sparc64_load_keys)
+	/* %o0=key */
+	VISEntry
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	ldd	[%o0 + 0x30], %f12
+	ldd	[%o0 + 0x38], %f14
+	ldd	[%o0 + 0x40], %f16
+	ldd	[%o0 + 0x48], %f18
+	ldd	[%o0 + 0x50], %f20
+	ldd	[%o0 + 0x58], %f22
+	ldd	[%o0 + 0x60], %f24
+	ldd	[%o0 + 0x68], %f26
+	ldd	[%o0 + 0x70], %f28
+	ldd	[%o0 + 0x78], %f30
+	ldd	[%o0 + 0x80], %f32
+	ldd	[%o0 + 0x88], %f34
+	ldd	[%o0 + 0x90], %f36
+	ldd	[%o0 + 0x98], %f38
+	ldd	[%o0 + 0xa0], %f40
+	ldd	[%o0 + 0xa8], %f42
+	ldd	[%o0 + 0xb0], %f44
+	ldd	[%o0 + 0xb8], %f46
+	ldd	[%o0 + 0xc0], %f48
+	ldd	[%o0 + 0xc8], %f50
+	ldd	[%o0 + 0xd0], %f52
+	ldd	[%o0 + 0xd8], %f54
+	ldd	[%o0 + 0xe0], %f56
+	retl
+	 ldd	[%o0 + 0xe8], %f58
+ENDPROC(des3_ede_sparc64_load_keys)
+
+#define DES3_LOOP_BODY(X) \
+	DES_IP(X, X) \
+	DES_ROUND(0, 2, X, X) \
+	DES_ROUND(4, 6, X, X) \
+	DES_ROUND(8, 10, X, X) \
+	DES_ROUND(12, 14, X, X) \
+	DES_ROUND(16, 18, X, X) \
+	ldd	[%o0 + 0xf0], %f16; \
+	ldd	[%o0 + 0xf8], %f18; \
+	DES_ROUND(20, 22, X, X) \
+	ldd	[%o0 + 0x100], %f20; \
+	ldd	[%o0 + 0x108], %f22; \
+	DES_ROUND(24, 26, X, X) \
+	ldd	[%o0 + 0x110], %f24; \
+	ldd	[%o0 + 0x118], %f26; \
+	DES_ROUND(28, 30, X, X) \
+	ldd	[%o0 + 0x120], %f28; \
+	ldd	[%o0 + 0x128], %f30; \
+	DES_IIP(X, X) \
+	DES_IP(X, X) \
+	DES_ROUND(32, 34, X, X) \
+	ldd	[%o0 + 0x130], %f0; \
+	ldd	[%o0 + 0x138], %f2; \
+	DES_ROUND(36, 38, X, X) \
+	ldd	[%o0 + 0x140], %f4; \
+	ldd	[%o0 + 0x148], %f6; \
+	DES_ROUND(40, 42, X, X) \
+	ldd	[%o0 + 0x150], %f8; \
+	ldd	[%o0 + 0x158], %f10; \
+	DES_ROUND(44, 46, X, X) \
+	ldd	[%o0 + 0x160], %f12; \
+	ldd	[%o0 + 0x168], %f14; \
+	DES_ROUND(48, 50, X, X) \
+	DES_ROUND(52, 54, X, X) \
+	DES_ROUND(56, 58, X, X) \
+	DES_ROUND(16, 18, X, X) \
+	ldd	[%o0 + 0x170], %f16; \
+	ldd	[%o0 + 0x178], %f18; \
+	DES_IIP(X, X) \
+	DES_IP(X, X) \
+	DES_ROUND(20, 22, X, X) \
+	ldd	[%o0 + 0x50], %f20; \
+	ldd	[%o0 + 0x58], %f22; \
+	DES_ROUND(24, 26, X, X) \
+	ldd	[%o0 + 0x60], %f24; \
+	ldd	[%o0 + 0x68], %f26; \
+	DES_ROUND(28, 30, X, X) \
+	ldd	[%o0 + 0x70], %f28; \
+	ldd	[%o0 + 0x78], %f30; \
+	DES_ROUND(0, 2, X, X) \
+	ldd	[%o0 + 0x00], %f0; \
+	ldd	[%o0 + 0x08], %f2; \
+	DES_ROUND(4, 6, X, X) \
+	ldd	[%o0 + 0x10], %f4; \
+	ldd	[%o0 + 0x18], %f6; \
+	DES_ROUND(8, 10, X, X) \
+	ldd	[%o0 + 0x20], %f8; \
+	ldd	[%o0 + 0x28], %f10; \
+	DES_ROUND(12, 14, X, X) \
+	ldd	[%o0 + 0x30], %f12; \
+	ldd	[%o0 + 0x38], %f14; \
+	DES_ROUND(16, 18, X, X) \
+	ldd	[%o0 + 0x40], %f16; \
+	ldd	[%o0 + 0x48], %f18; \
+	DES_IIP(X, X)
+
+	.align	32
+ENTRY(des3_ede_sparc64_ecb_crypt)
+	/* %o0=key, %o1=input, %o2=output, %o3=len */
+1:	ldd	[%o1 + 0x00], %f60
+	DES3_LOOP_BODY(60)
+	std	%f60, [%o2 + 0x00]
+	subcc	%o3, 0x08, %o3
+	bne,pt	%icc, 1b
+	 add	%o2, 0x08, %o2
+	retl
+	 nop
+ENDPROC(des3_ede_sparc64_ecb_crypt)
+
+	.align	32
+ENTRY(des3_ede_sparc64_cbc_encrypt)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd	[%o4 + 0x00], %f60
+1:	ldd	[%o1 + 0x00], %f62
+	fxor	%f60, %f62, %f60
+	DES3_LOOP_BODY(60)
+	std	%f60, [%o2 + 0x00]
+	add	%o1, 0x08, %o1
+	subcc	%o3, 0x08, %o3
+	bne,pt	%icc, 1b
+	 add	%o2, 0x08, %o2
+	retl
+	 std	%f60, [%o4 + 0x00]
+ENDPROC(des3_ede_sparc64_cbc_encrypt)
+
+	.align	32
+ENTRY(des3_ede_sparc64_cbc_decrypt)
+	/* %o0=key, %o1=input, %o2=output, %o3=len, %o4=IV */
+	ldd	[%o4 + 0x00], %f62
+1:	ldx	[%o1 + 0x00], %g1
+	MOVXTOD_G1_F60
+	DES3_LOOP_BODY(60)
+	fxor	%f62, %f60, %f60
+	MOVXTOD_G1_F62
+	std	%f60, [%o2 + 0x00]
+	add	%o1, 0x08, %o1
+	subcc	%o3, 0x08, %o3
+	bne,pt	%icc, 1b
+	 add	%o2, 0x08, %o2
+	retl
+	 stx	%g1, [%o4 + 0x00]
+ENDPROC(des3_ede_sparc64_cbc_decrypt)
diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c
new file mode 100644
index 000000000000..c4940c2d3073
--- /dev/null
+++ b/arch/sparc/crypto/des_glue.c
@@ -0,0 +1,529 @@
+/* Glue code for DES encryption optimized for sparc64 crypto opcodes.
+ *
+ * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/crypto.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <crypto/algapi.h>
+#include <crypto/des.h>
+
+#include <asm/fpumacro.h>
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+struct des_sparc64_ctx {
+	u64 encrypt_expkey[DES_EXPKEY_WORDS / 2];
+	u64 decrypt_expkey[DES_EXPKEY_WORDS / 2];
+};
+
+struct des3_ede_sparc64_ctx {
+	u64 encrypt_expkey[DES3_EDE_EXPKEY_WORDS / 2];
+	u64 decrypt_expkey[DES3_EDE_EXPKEY_WORDS / 2];
+};
+
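+/* DES decryption runs the same 16-round Feistel network as encryption but
+ * with the subkeys applied in reverse order; each u64 below holds one
+ * round's expanded subkey, so reversing the 16-word array turns the
+ * encrypt schedule into the decrypt schedule.
+ */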
+static void encrypt_to_decrypt(u64 *d, const u64 *e)
+{
+	const u64 *s = e + (DES_EXPKEY_WORDS / 2) - 1;
+	int i;
+
+	for (i = 0; i < DES_EXPKEY_WORDS / 2; i++)
+		*d++ = *s--;
+}
+
+extern void des_sparc64_key_expand(const u32 *input_key, u64 *output_key);
+
+static int des_set_key(struct crypto_tfm *tfm, const u8 *key,
+		       unsigned int keylen)
+{
+	struct des_sparc64_ctx *dctx = crypto_tfm_ctx(tfm);
+	u32 *flags = &tfm->crt_flags;
+	u32 tmp[DES_EXPKEY_WORDS];
+	int ret;
+
+	/* Even though we have special instructions for key expansion,
+	 * we call des_ekey() so that we don't have to write our own
+	 * weak key detection code.
+	 */
+	ret = des_ekey(tmp, key);
+	if (unlikely(ret == 0) && (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) {
+		*flags |= CRYPTO_TFM_RES_WEAK_KEY;
+		return -EINVAL;
+	}
+
+	des_sparc64_key_expand((const u32 *) key, &dctx->encrypt_expkey[0]);
+	encrypt_to_decrypt(&dctx->decrypt_expkey[0], &dctx->encrypt_expkey[0]);
+
+	return 0;
+}
+
+extern void des_sparc64_crypt(const u64 *key, const u64 *input,
+			      u64 *output);
+
+static void des_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct des_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u64 *K = ctx->encrypt_expkey;
+
+	des_sparc64_crypt(K, (const u64 *) src, (u64 *) dst);
+}
+
+static void des_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct des_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u64 *K = ctx->decrypt_expkey;
+
+	des_sparc64_crypt(K, (const u64 *) src, (u64 *) dst);
+}
+
+extern void des_sparc64_load_keys(const u64 *key);
+
+extern void des_sparc64_ecb_crypt(const u64 *input, u64 *output,
+				  unsigned int len);
+
+#define DES_BLOCK_MASK	(~(DES_BLOCK_SIZE - 1))
+
+static int __ecb_crypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes, bool encrypt)
+{
+	struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	if (encrypt)
+		des_sparc64_load_keys(&ctx->encrypt_expkey[0]);
+	else
+		des_sparc64_load_keys(&ctx->decrypt_expkey[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			des_sparc64_ecb_crypt((const u64 *)walk.src.virt.addr,
+					      (u64 *) walk.dst.virt.addr,
+					      block_len);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int ecb_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb_crypt(desc, dst, src, nbytes, true);
+}
+
+static int ecb_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb_crypt(desc, dst, src, nbytes, false);
+}
+
+extern void des_sparc64_cbc_encrypt(const u64 *input, u64 *output,
+				    unsigned int len, u64 *iv);
+
+static int cbc_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	des_sparc64_load_keys(&ctx->encrypt_expkey[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			des_sparc64_cbc_encrypt((const u64 *)walk.src.virt.addr,
+						(u64 *) walk.dst.virt.addr,
+						block_len, (u64 *) walk.iv);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+extern void des_sparc64_cbc_decrypt(const u64 *input, u64 *output,
+				    unsigned int len, u64 *iv);
+
+static int cbc_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	struct des_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	des_sparc64_load_keys(&ctx->decrypt_expkey[0]);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			des_sparc64_cbc_decrypt((const u64 *)walk.src.virt.addr,
+						(u64 *) walk.dst.virt.addr,
+						block_len, (u64 *) walk.iv);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
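+/* 3DES-EDE encryption is E(k1), D(k2), E(k3), and a DES decryption is the
+ * same rounds with the subkeys reversed, so the schedules built below are:
+ *
+ *	encrypt_expkey = [ k1 forward | k2 reversed | k3 forward ]
+ *	decrypt_expkey = [ k3 reversed | k2 forward | k1 reversed ]
+ */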
+static int des3_ede_set_key(struct crypto_tfm *tfm, const u8 *key,
+			    unsigned int keylen)
+{
+	struct des3_ede_sparc64_ctx *dctx = crypto_tfm_ctx(tfm);
+	const u32 *K = (const u32 *)key;
+	u32 *flags = &tfm->crt_flags;
+	u64 k1[DES_EXPKEY_WORDS / 2];
+	u64 k2[DES_EXPKEY_WORDS / 2];
+	u64 k3[DES_EXPKEY_WORDS / 2];
+
+	if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) ||
+		     !((K[2] ^ K[4]) | (K[3] ^ K[5]))) &&
+		     (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) {
+		*flags |= CRYPTO_TFM_RES_WEAK_KEY;
+		return -EINVAL;
+	}
+
+	des_sparc64_key_expand((const u32 *)key, k1);
+	key += DES_KEY_SIZE;
+	des_sparc64_key_expand((const u32 *)key, k2);
+	key += DES_KEY_SIZE;
+	des_sparc64_key_expand((const u32 *)key, k3);
+
+	memcpy(&dctx->encrypt_expkey[0], &k1[0], sizeof(k1));
+	encrypt_to_decrypt(&dctx->encrypt_expkey[DES_EXPKEY_WORDS / 2], &k2[0]);
+	memcpy(&dctx->encrypt_expkey[(DES_EXPKEY_WORDS / 2) * 2],
+	       &k3[0], sizeof(k3));
+
+	encrypt_to_decrypt(&dctx->decrypt_expkey[0], &k3[0]);
+	memcpy(&dctx->decrypt_expkey[DES_EXPKEY_WORDS / 2],
+	       &k2[0], sizeof(k2));
+	encrypt_to_decrypt(&dctx->decrypt_expkey[(DES_EXPKEY_WORDS / 2) * 2],
+			   &k1[0]);
+
+	return 0;
+}
+
+extern void des3_ede_sparc64_crypt(const u64 *key, const u64 *input,
+				   u64 *output);
+
+static void des3_ede_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u64 *K = ctx->encrypt_expkey;
+
+	des3_ede_sparc64_crypt(K, (const u64 *) src, (u64 *) dst);
+}
+
+static void des3_ede_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_tfm_ctx(tfm);
+	const u64 *K = ctx->decrypt_expkey;
+
+	des3_ede_sparc64_crypt(K, (const u64 *) src, (u64 *) dst);
+}
+
+extern void des3_ede_sparc64_load_keys(const u64 *key);
+
+extern void des3_ede_sparc64_ecb_crypt(const u64 *expkey, const u64 *input,
+				       u64 *output, unsigned int len);
+
+static int __ecb3_crypt(struct blkcipher_desc *desc,
+			struct scatterlist *dst, struct scatterlist *src,
+			unsigned int nbytes, bool encrypt)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	const u64 *K;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	if (encrypt)
+		K = &ctx->encrypt_expkey[0];
+	else
+		K = &ctx->decrypt_expkey[0];
+	des3_ede_sparc64_load_keys(K);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64 = (const u64 *)walk.src.virt.addr;
+			des3_ede_sparc64_ecb_crypt(K, src64,
+						   (u64 *) walk.dst.virt.addr,
+						   block_len);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static int ecb3_encrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb3_crypt(desc, dst, src, nbytes, true);
+}
+
+static int ecb3_decrypt(struct blkcipher_desc *desc,
+		       struct scatterlist *dst, struct scatterlist *src,
+		       unsigned int nbytes)
+{
+	return __ecb3_crypt(desc, dst, src, nbytes, false);
+}
+
+extern void des3_ede_sparc64_cbc_encrypt(const u64 *expkey, const u64 *input,
+					 u64 *output, unsigned int len,
+					 u64 *iv);
+
+static int cbc3_encrypt(struct blkcipher_desc *desc,
+			struct scatterlist *dst, struct scatterlist *src,
+			unsigned int nbytes)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	const u64 *K;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	K = &ctx->encrypt_expkey[0];
+	des3_ede_sparc64_load_keys(K);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64 = (const u64 *)walk.src.virt.addr;
+			des3_ede_sparc64_cbc_encrypt(K, src64,
+						     (u64 *) walk.dst.virt.addr,
+						     block_len,
+						     (u64 *) walk.iv);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+extern void des3_ede_sparc64_cbc_decrypt(const u64 *expkey, const u64 *input,
+					 u64 *output, unsigned int len,
+					 u64 *iv);
+
+static int cbc3_decrypt(struct blkcipher_desc *desc,
+			struct scatterlist *dst, struct scatterlist *src,
+			unsigned int nbytes)
+{
+	struct des3_ede_sparc64_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+	struct blkcipher_walk walk;
+	const u64 *K;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt(desc, &walk);
+
+	K = &ctx->decrypt_expkey[0];
+	des3_ede_sparc64_load_keys(K);
+	while ((nbytes = walk.nbytes)) {
+		unsigned int block_len = nbytes & DES_BLOCK_MASK;
+
+		if (likely(block_len)) {
+			const u64 *src64 = (const u64 *)walk.src.virt.addr;
+			des3_ede_sparc64_cbc_decrypt(K, src64,
+						     (u64 *) walk.dst.virt.addr,
+						     block_len,
+						     (u64 *) walk.iv);
+		}
+		nbytes &= DES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	fprs_write(0);
+	return err;
+}
+
+static struct crypto_alg algs[] = { {
+	.cra_name		= "des",
+	.cra_driver_name	= "des-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= DES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= DES_KEY_SIZE,
+			.cia_max_keysize	= DES_KEY_SIZE,
+			.cia_setkey		= des_set_key,
+			.cia_encrypt		= des_encrypt,
+			.cia_decrypt		= des_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(des)",
+	.cra_driver_name	= "ecb-des-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES_KEY_SIZE,
+			.max_keysize	= DES_KEY_SIZE,
+			.setkey		= des_set_key,
+			.encrypt	= ecb_encrypt,
+			.decrypt	= ecb_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(des)",
+	.cra_driver_name	= "cbc-des-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES_KEY_SIZE,
+			.max_keysize	= DES_KEY_SIZE,
+			.setkey		= des_set_key,
+			.encrypt	= cbc_encrypt,
+			.decrypt	= cbc_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "des3_ede",
+	.cra_driver_name	= "des3_ede-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_module		= THIS_MODULE,
+	.cra_u	= {
+		.cipher	= {
+			.cia_min_keysize	= DES3_EDE_KEY_SIZE,
+			.cia_max_keysize	= DES3_EDE_KEY_SIZE,
+			.cia_setkey		= des3_ede_set_key,
+			.cia_encrypt		= des3_ede_encrypt,
+			.cia_decrypt		= des3_ede_decrypt
+		}
+	}
+}, {
+	.cra_name		= "ecb(des3_ede)",
+	.cra_driver_name	= "ecb-des3_ede-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES3_EDE_KEY_SIZE,
+			.max_keysize	= DES3_EDE_KEY_SIZE,
+			.setkey		= des3_ede_set_key,
+			.encrypt	= ecb3_encrypt,
+			.decrypt	= ecb3_decrypt,
+		},
+	},
+}, {
+	.cra_name		= "cbc(des3_ede)",
+	.cra_driver_name	= "cbc-des3_ede-sparc64",
+	.cra_priority		= SPARC_CR_OPCODE_PRIORITY,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= DES3_EDE_BLOCK_SIZE,
+	.cra_ctxsize		= sizeof(struct des3_ede_sparc64_ctx),
+	.cra_alignmask		= 7,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= DES3_EDE_KEY_SIZE,
+			.max_keysize	= DES3_EDE_KEY_SIZE,
+			.setkey		= des3_ede_set_key,
+			.encrypt	= cbc3_encrypt,
+			.decrypt	= cbc3_decrypt,
+		},
+	},
+} };
+
+static bool __init sparc64_has_des_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_DES))
+		return false;
+
+	return true;
+}
+
+static int __init des_sparc64_mod_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(algs); i++)
+		INIT_LIST_HEAD(&algs[i].cra_list);
+
+	if (sparc64_has_des_opcode()) {
+		pr_info("Using sparc64 des opcodes optimized DES implementation\n");
+		return crypto_register_algs(algs, ARRAY_SIZE(algs));
+	}
+	pr_info("sparc64 des opcodes not available.\n");
+	return -ENODEV;
+}
+
+static void __exit des_sparc64_mod_fini(void)
+{
+	crypto_unregister_algs(algs, ARRAY_SIZE(algs));
+}
+
+module_init(des_sparc64_mod_init);
+module_exit(des_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("DES & Triple DES EDE Cipher Algorithms, sparc64 des opcode accelerated");
+
+MODULE_ALIAS("des");
diff --git a/arch/sparc/crypto/md5_asm.S b/arch/sparc/crypto/md5_asm.S
new file mode 100644
index 000000000000..3150404e602e
--- /dev/null
+++ b/arch/sparc/crypto/md5_asm.S
@@ -0,0 +1,70 @@
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+ENTRY(md5_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x08], %f2
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x0c], %f3
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	MD5
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	MD5
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(md5_sparc64_transform)
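
The slow path at label 10 handles a source buffer that is not 8-byte aligned: alignaddr rounds %o1 down and latches the misalignment, and each faligndata then stitches the wanted 8 bytes out of two adjacent aligned doublewords. A hedged C model of that stitching (illustrative only, not part of the patch):

	#include <stdint.h>

	/* Model of faligndata on big-endian sparc64: concatenate two
	 * consecutive aligned doublewords and take the 8 bytes that
	 * begin at offset `off` (the original address & 7). */
	static uint64_t faligndata_model(uint64_t first, uint64_t second,
					 unsigned int off)
	{
		if (!off)
			return first;
		return (first << (8 * off)) | (second >> (64 - 8 * off));
	}
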
diff --git a/arch/sparc/crypto/md5_glue.c b/arch/sparc/crypto/md5_glue.c
new file mode 100644
index 000000000000..603d723038ce
--- /dev/null
+++ b/arch/sparc/crypto/md5_glue.c
@@ -0,0 +1,188 @@
+/* Glue code for MD5 hashing optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c
+ * and crypto/md5.c which are:
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ * Copyright (c) Cryptoapi developers.
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/md5.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+asmlinkage void md5_sparc64_transform(u32 *digest, const char *data,
+				      unsigned int rounds);
+
+static int md5_sparc64_init(struct shash_desc *desc)
+{
+	struct md5_state *mctx = shash_desc_ctx(desc);
+
+	mctx->hash[0] = cpu_to_le32(0x67452301);
+	mctx->hash[1] = cpu_to_le32(0xefcdab89);
+	mctx->hash[2] = cpu_to_le32(0x98badcfe);
+	mctx->hash[3] = cpu_to_le32(0x10325476);
+	mctx->byte_count = 0;
+
+	return 0;
+}
+
+static void __md5_sparc64_update(struct md5_state *sctx, const u8 *data,
+				 unsigned int len, unsigned int partial)
+{
+	unsigned int done = 0;
+
+	sctx->byte_count += len;
+	if (partial) {
+		done = MD5_HMAC_BLOCK_SIZE - partial;
+		memcpy((u8 *)sctx->block + partial, data, done);
+		md5_sparc64_transform(sctx->hash, (u8 *)sctx->block, 1);
+	}
+	if (len - done >= MD5_HMAC_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / MD5_HMAC_BLOCK_SIZE;
+
+		md5_sparc64_transform(sctx->hash, data + done, rounds);
+		done += rounds * MD5_HMAC_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->block, data + done, len - done);
+}
+
+static int md5_sparc64_update(struct shash_desc *desc, const u8 *data,
+			      unsigned int len)
+{
+	struct md5_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->byte_count % MD5_HMAC_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < MD5_HMAC_BLOCK_SIZE) {
+		sctx->byte_count += len;
+		memcpy((u8 *)sctx->block + partial, data, len);
+	} else
+		__md5_sparc64_update(sctx, data, len, partial);
+
+	return 0;
+}
+
+/* Add padding and return the message digest. */
+static int md5_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	struct md5_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	u32 *dst = (u32 *)out;
+	__le64 bits;
+	static const u8 padding[MD5_HMAC_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_le64(sctx->byte_count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->byte_count % MD5_HMAC_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((MD5_HMAC_BLOCK_SIZE+56) - index);
+
+	/* We need to fill a whole block for __md5_sparc64_update() */
+	if (padlen <= 56) {
+		sctx->byte_count += padlen;
+		memcpy((u8 *)sctx->block + index, padding, padlen);
+	} else {
+		__md5_sparc64_update(sctx, padding, padlen, index);
+	}
+	__md5_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < MD5_HASH_WORDS; i++)
+		dst[i] = sctx->hash[i];
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int md5_sparc64_export(struct shash_desc *desc, void *out)
+{
+	struct md5_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int md5_sparc64_import(struct shash_desc *desc, const void *in)
+{
+	struct md5_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	MD5_DIGEST_SIZE,
+	.init		=	md5_sparc64_init,
+	.update		=	md5_sparc64_update,
+	.final		=	md5_sparc64_final,
+	.export		=	md5_sparc64_export,
+	.import		=	md5_sparc64_import,
+	.descsize	=	sizeof(struct md5_state),
+	.statesize	=	sizeof(struct md5_state),
+	.base		=	{
+		.cra_name	=	"md5",
+		.cra_driver_name=	"md5-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	MD5_HMAC_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static bool __init sparc64_has_md5_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_MD5))
+		return false;
+
+	return true;
+}
+
+static int __init md5_sparc64_mod_init(void)
+{
+	if (sparc64_has_md5_opcode()) {
+		pr_info("Using sparc64 md5 opcode optimized MD5 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("sparc64 md5 opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit md5_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(md5_sparc64_mod_init);
+module_exit(md5_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MD5 Secure Hash Algorithm, sparc64 md5 opcode accelerated");
+
+MODULE_ALIAS("md5");
diff --git a/arch/sparc/crypto/opcodes.h b/arch/sparc/crypto/opcodes.h
new file mode 100644
index 000000000000..19cbaea6976f
--- /dev/null
+++ b/arch/sparc/crypto/opcodes.h
@@ -0,0 +1,99 @@
+#ifndef _OPCODES_H
+#define _OPCODES_H
+
+#define SPARC_CR_OPCODE_PRIORITY	300
+
+#define F3F(x,y,z)	(((x)<<30)|((y)<<19)|((z)<<5))
+
+#define FPD_ENCODE(x)	(((x) >> 5) | ((x) & ~(0x20)))
+
+#define RS1(x)		(FPD_ENCODE(x) << 14)
+#define RS2(x)		(FPD_ENCODE(x) <<  0)
+#define RS3(x)		(FPD_ENCODE(x) <<  9)
+#define RD(x)		(FPD_ENCODE(x) << 25)
+#define IMM5_0(x)	((x)           <<  0)
+#define IMM5_9(x)	((x)           <<  9)
+
+#define CRC32C(a,b,c)	\
+	.word		(F3F(2,0x36,0x147)|RS1(a)|RS2(b)|RD(c));
+
+#define MD5		\
+	.word	0x81b02800;
+#define SHA1		\
+	.word	0x81b02820;
+#define SHA256		\
+	.word	0x81b02840;
+#define SHA512		\
+	.word	0x81b02860;
+
+#define AES_EROUND01(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 0)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND23(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 1)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND01(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 2)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND23(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 3)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND01_L(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 4)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_EROUND23_L(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 5)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND01_L(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 6)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_DROUND23_L(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 7)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define AES_KEXPAND1(a,b,c,d)	\
+	.word	(F3F(2, 0x19, 8)|RS1(a)|RS2(b)|IMM5_9(c)|RD(d));
+#define AES_KEXPAND0(a,b,c)	\
+	.word	(F3F(2, 0x36, 0x130)|RS1(a)|RS2(b)|RD(c));
+#define AES_KEXPAND2(a,b,c)	\
+	.word	(F3F(2, 0x36, 0x131)|RS1(a)|RS2(b)|RD(c));
+
+#define DES_IP(a,b)		\
+	.word		(F3F(2, 0x36, 0x134)|RS1(a)|RD(b));
+#define DES_IIP(a,b)		\
+	.word		(F3F(2, 0x36, 0x135)|RS1(a)|RD(b));
+#define DES_KEXPAND(a,b,c)	\
+	.word		(F3F(2, 0x36, 0x136)|RS1(a)|IMM5_0(b)|RD(c));
+#define DES_ROUND(a,b,c,d)	\
+	.word		(F3F(2, 0x19, 0x009)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+
+#define CAMELLIA_F(a,b,c,d)		\
+	.word		(F3F(2, 0x19, 0x00c)|RS1(a)|RS2(b)|RS3(c)|RD(d));
+#define CAMELLIA_FL(a,b,c)		\
+	.word		(F3F(2, 0x36, 0x13c)|RS1(a)|RS2(b)|RD(c));
+#define CAMELLIA_FLI(a,b,c)		\
+	.word		(F3F(2, 0x36, 0x13d)|RS1(a)|RS2(b)|RD(c));
+
+#define MOVDTOX_F0_O4		\
+	.word	0x99b02200;
+#define MOVDTOX_F2_O5		\
+	.word	0x9bb02202;
+#define MOVXTOD_G1_F60		\
+	.word	0xbbb02301;
+#define MOVXTOD_G1_F62		\
+	.word	0xbfb02301;
+#define MOVXTOD_G3_F4		\
+	.word	0x89b02303;
+#define MOVXTOD_G7_F6		\
+	.word	0x8db02307;
+#define MOVXTOD_G3_F0		\
+	.word	0x81b02303;
+#define MOVXTOD_G7_F2		\
+	.word	0x85b02307;
+#define MOVXTOD_O0_F0		\
+	.word	0x81b02308;
+#define MOVXTOD_O5_F0		\
+	.word	0x81b0230d;
+#define MOVXTOD_O5_F2		\
+	.word	0x85b0230d;
+#define MOVXTOD_O5_F4		\
+	.word	0x89b0230d;
+#define MOVXTOD_O5_F6		\
+	.word	0x8db0230d;
+#define MOVXTOD_G3_F60		\
+	.word	0xbbb02303;
+#define MOVXTOD_G7_F62		\
+	.word	0xbfb02307;
+
+#endif /* _OPCODES_H */
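
The hash opcodes above are hand-assembled as literal words, but they are consistent with the F3F() encoding used for the rest of the file: taking opf values 0x140-0x143 (inferred here from the words themselves, not stated in the patch), a quick host-side check reproduces them:

	#include <stdint.h>
	#include <stdio.h>

	#define F3F(x, y, z) (((uint32_t)(x) << 30) | ((uint32_t)(y) << 19) | \
			      ((uint32_t)(z) << 5))

	int main(void)
	{
		/* Prints 81b02800 81b02820 81b02840 81b02860, matching the
		 * MD5/SHA1/SHA256/SHA512 .word values above. */
		printf("%08x %08x %08x %08x\n",
		       F3F(2, 0x36, 0x140), F3F(2, 0x36, 0x141),
		       F3F(2, 0x36, 0x142), F3F(2, 0x36, 0x143));
		return 0;
	}
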
diff --git a/arch/sparc/crypto/sha1_asm.S b/arch/sparc/crypto/sha1_asm.S
new file mode 100644
index 000000000000..219d10c5ae0e
--- /dev/null
+++ b/arch/sparc/crypto/sha1_asm.S
@@ -0,0 +1,72 @@
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+ENTRY(sha1_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x0c], %f3
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x10], %f4
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	SHA1
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	st	%f4, [%o0 + 0x10]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	SHA1
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha1_sparc64_transform)
diff --git a/arch/sparc/crypto/sha1_glue.c b/arch/sparc/crypto/sha1_glue.c
new file mode 100644
index 000000000000..2bbb20bee9f1
--- /dev/null
+++ b/arch/sparc/crypto/sha1_glue.c
@@ -0,0 +1,183 @@
+/* Glue code for SHA1 hashing optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon arch/x86/crypto/sha1_ssse3_glue.c
+ *
+ * Copyright (c) Alan Smithee.
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ * Copyright (c) Mathias Krause <minipli@googlemail.com>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+asmlinkage void sha1_sparc64_transform(u32 *digest, const char *data,
+				       unsigned int rounds);
+
+static int sha1_sparc64_init(struct shash_desc *desc)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha1_state){
+		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+	};
+
+	return 0;
+}
+
+static void __sha1_sparc64_update(struct sha1_state *sctx, const u8 *data,
+				  unsigned int len, unsigned int partial)
+{
+	unsigned int done = 0;
+
+	sctx->count += len;
+	if (partial) {
+		done = SHA1_BLOCK_SIZE - partial;
+		memcpy(sctx->buffer + partial, data, done);
+		sha1_sparc64_transform(sctx->state, sctx->buffer, 1);
+	}
+	if (len - done >= SHA1_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
+
+		sha1_sparc64_transform(sctx->state, data + done, rounds);
+		done += rounds * SHA1_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buffer, data + done, len - done);
+}
+
+static int sha1_sparc64_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA1_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buffer + partial, data, len);
+	} else
+		__sha1_sparc64_update(sctx, data, len, partial);
+
+	return 0;
+}
+
+/* Add padding and return the message digest. */
+static int sha1_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA1_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
+
+	/* We need to fill a whole block for __sha1_sparc64_update() */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buffer + index, padding, padlen);
+	} else {
+		__sha1_sparc64_update(sctx, padding, padlen, index);
+	}
+	__sha1_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < 5; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_sparc64_export(struct shash_desc *desc, void *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_sparc64_import(struct shash_desc *desc, const void *in)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA1_DIGEST_SIZE,
+	.init		=	sha1_sparc64_init,
+	.update		=	sha1_sparc64_update,
+	.final		=	sha1_sparc64_final,
+	.export		=	sha1_sparc64_export,
+	.import		=	sha1_sparc64_import,
+	.descsize	=	sizeof(struct sha1_state),
+	.statesize	=	sizeof(struct sha1_state),
+	.base		=	{
+		.cra_name	=	"sha1",
+		.cra_driver_name=	"sha1-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA1_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static bool __init sparc64_has_sha1_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA1))
+		return false;
+
+	return true;
+}
+
+static int __init sha1_sparc64_mod_init(void)
+{
+	if (sparc64_has_sha1_opcode()) {
+		pr_info("Using sparc64 sha1 opcode optimized SHA-1 implementation\n");
+		return crypto_register_shash(&alg);
+	}
+	pr_info("sparc64 sha1 opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit sha1_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha1_sparc64_mod_init);
+module_exit(sha1_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, sparc64 sha1 opcode accelerated");
+
+MODULE_ALIAS("sha1");
diff --git a/arch/sparc/crypto/sha256_asm.S b/arch/sparc/crypto/sha256_asm.S
new file mode 100644
index 000000000000..b5f3d5826eb4
--- /dev/null
+++ b/arch/sparc/crypto/sha256_asm.S
@@ -0,0 +1,78 @@
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+ENTRY(sha256_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntryHalf
+	ld	[%o0 + 0x00], %f0
+	ld	[%o0 + 0x04], %f1
+	ld	[%o0 + 0x08], %f2
+	ld	[%o0 + 0x0c], %f3
+	ld	[%o0 + 0x10], %f4
+	ld	[%o0 + 0x14], %f5
+	andcc	%o1, 0x7, %g0
+	ld	[%o0 + 0x18], %f6
+	bne,pn	%xcc, 10f
+	 ld	[%o0 + 0x1c], %f7
+
+1:
+	ldd	[%o1 + 0x00], %f8
+	ldd	[%o1 + 0x08], %f10
+	ldd	[%o1 + 0x10], %f12
+	ldd	[%o1 + 0x18], %f14
+	ldd	[%o1 + 0x20], %f16
+	ldd	[%o1 + 0x28], %f18
+	ldd	[%o1 + 0x30], %f20
+	ldd	[%o1 + 0x38], %f22
+
+	SHA256
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+5:
+	st	%f0, [%o0 + 0x00]
+	st	%f1, [%o0 + 0x04]
+	st	%f2, [%o0 + 0x08]
+	st	%f3, [%o0 + 0x0c]
+	st	%f4, [%o0 + 0x10]
+	st	%f5, [%o0 + 0x14]
+	st	%f6, [%o0 + 0x18]
+	st	%f7, [%o0 + 0x1c]
+	retl
+	 VISExitHalf
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f10
+1:
+	ldd	[%o1 + 0x08], %f12
+	ldd	[%o1 + 0x10], %f14
+	ldd	[%o1 + 0x18], %f16
+	ldd	[%o1 + 0x20], %f18
+	ldd	[%o1 + 0x28], %f20
+	ldd	[%o1 + 0x30], %f22
+	ldd	[%o1 + 0x38], %f24
+	ldd	[%o1 + 0x40], %f26
+
+	faligndata %f10, %f12, %f8
+	faligndata %f12, %f14, %f10
+	faligndata %f14, %f16, %f12
+	faligndata %f16, %f18, %f14
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+
+	SHA256
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f26, %f10
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x40, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha256_sparc64_transform)
diff --git a/arch/sparc/crypto/sha256_glue.c b/arch/sparc/crypto/sha256_glue.c
new file mode 100644
index 000000000000..591e656bd891
--- /dev/null
+++ b/arch/sparc/crypto/sha256_glue.c
@@ -0,0 +1,241 @@
+/* Glue code for SHA256 hashing optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon crypto/sha256_generic.c
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * SHA224 Support Copyright 2007 Intel Corporation <jonathan.lynch@intel.com>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+asmlinkage void sha256_sparc64_transform(u32 *digest, const char *data,
+					 unsigned int rounds);
+
+static int sha224_sparc64_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	sctx->state[0] = SHA224_H0;
+	sctx->state[1] = SHA224_H1;
+	sctx->state[2] = SHA224_H2;
+	sctx->state[3] = SHA224_H3;
+	sctx->state[4] = SHA224_H4;
+	sctx->state[5] = SHA224_H5;
+	sctx->state[6] = SHA224_H6;
+	sctx->state[7] = SHA224_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static int sha256_sparc64_init(struct shash_desc *desc)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	sctx->state[0] = SHA256_H0;
+	sctx->state[1] = SHA256_H1;
+	sctx->state[2] = SHA256_H2;
+	sctx->state[3] = SHA256_H3;
+	sctx->state[4] = SHA256_H4;
+	sctx->state[5] = SHA256_H5;
+	sctx->state[6] = SHA256_H6;
+	sctx->state[7] = SHA256_H7;
+	sctx->count = 0;
+
+	return 0;
+}
+
+static void __sha256_sparc64_update(struct sha256_state *sctx, const u8 *data,
+				    unsigned int len, unsigned int partial)
+{
+	unsigned int done = 0;
+
+	sctx->count += len;
+	if (partial) {
+		done = SHA256_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha256_sparc64_transform(sctx->state, sctx->buf, 1);
+	}
+	if (len - done >= SHA256_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
+
+		sha256_sparc64_transform(sctx->state, data + done, rounds);
+		done += rounds * SHA256_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+}
+
+static int sha256_sparc64_update(struct shash_desc *desc, const u8 *data,
+				 unsigned int len)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA256_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buf + partial, data, len);
+	} else
+		__sha256_sparc64_update(sctx, data, len, partial);
+
+	return 0;
+}
+
+static int sha256_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA256_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56) - index);
+
+	/* We need to fill a whole block for __sha256_sparc64_update() */
+	if (padlen <= 56) {
+		sctx->count += padlen;
+		memcpy(sctx->buf + index, padding, padlen);
+	} else {
+		__sha256_sparc64_update(sctx, padding, padlen, index);
+	}
+	__sha256_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 56);
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha224_sparc64_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 D[SHA256_DIGEST_SIZE];
+
+	sha256_sparc64_final(desc, D);
+
+	memcpy(hash, D, SHA224_DIGEST_SIZE);
+	memset(D, 0, SHA256_DIGEST_SIZE);
+
+	return 0;
+}
+
+static int sha256_sparc64_export(struct shash_desc *desc, void *out)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+	return 0;
+}
+
+static int sha256_sparc64_import(struct shash_desc *desc, const void *in)
+{
+	struct sha256_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+	return 0;
+}
+
+static struct shash_alg sha256 = {
+	.digestsize	=	SHA256_DIGEST_SIZE,
+	.init		=	sha256_sparc64_init,
+	.update		=	sha256_sparc64_update,
+	.final		=	sha256_sparc64_final,
+	.export		=	sha256_sparc64_export,
+	.import		=	sha256_sparc64_import,
+	.descsize	=	sizeof(struct sha256_state),
+	.statesize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha256",
+		.cra_driver_name=	"sha256-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA256_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static struct shash_alg sha224 = {
+	.digestsize	=	SHA224_DIGEST_SIZE,
+	.init		=	sha224_sparc64_init,
+	.update		=	sha256_sparc64_update,
+	.final		=	sha224_sparc64_final,
+	.descsize	=	sizeof(struct sha256_state),
+	.base		=	{
+		.cra_name	=	"sha224",
+		.cra_driver_name=	"sha224-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA224_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static bool __init sparc64_has_sha256_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA256))
+		return false;
+
+	return true;
+}
+
+static int __init sha256_sparc64_mod_init(void)
+{
+	if (sparc64_has_sha256_opcode()) {
+		int ret = crypto_register_shash(&sha224);
+		if (ret < 0)
+			return ret;
+
+		ret = crypto_register_shash(&sha256);
+		if (ret < 0) {
+			crypto_unregister_shash(&sha224);
+			return ret;
+		}
+
+		pr_info("Using sparc64 sha256 opcode optimized SHA-256/SHA-224 implementation\n");
+		return 0;
+	}
+	pr_info("sparc64 sha256 opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit sha256_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&sha224);
+	crypto_unregister_shash(&sha256);
+}
+
+module_init(sha256_sparc64_mod_init);
+module_exit(sha256_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-224 and SHA-256 Secure Hash Algorithm, sparc64 sha256 opcode accelerated");
+
+MODULE_ALIAS("sha224");
+MODULE_ALIAS("sha256");
diff --git a/arch/sparc/crypto/sha512_asm.S b/arch/sparc/crypto/sha512_asm.S
new file mode 100644
index 000000000000..54bfba713c0e
--- /dev/null
+++ b/arch/sparc/crypto/sha512_asm.S
@@ -0,0 +1,102 @@
+#include <linux/linkage.h>
+#include <asm/visasm.h>
+
+#include "opcodes.h"
+
+ENTRY(sha512_sparc64_transform)
+	/* %o0 = digest, %o1 = data, %o2 = rounds */
+	VISEntry
+	ldd	[%o0 + 0x00], %f0
+	ldd	[%o0 + 0x08], %f2
+	ldd	[%o0 + 0x10], %f4
+	ldd	[%o0 + 0x18], %f6
+	ldd	[%o0 + 0x20], %f8
+	ldd	[%o0 + 0x28], %f10
+	andcc	%o1, 0x7, %g0
+	ldd	[%o0 + 0x30], %f12
+	bne,pn	%xcc, 10f
+	 ldd	[%o0 + 0x38], %f14
+
+1:
+	ldd	[%o1 + 0x00], %f16
+	ldd	[%o1 + 0x08], %f18
+	ldd	[%o1 + 0x10], %f20
+	ldd	[%o1 + 0x18], %f22
+	ldd	[%o1 + 0x20], %f24
+	ldd	[%o1 + 0x28], %f26
+	ldd	[%o1 + 0x30], %f28
+	ldd	[%o1 + 0x38], %f30
+	ldd	[%o1 + 0x40], %f32
+	ldd	[%o1 + 0x48], %f34
+	ldd	[%o1 + 0x50], %f36
+	ldd	[%o1 + 0x58], %f38
+	ldd	[%o1 + 0x60], %f40
+	ldd	[%o1 + 0x68], %f42
+	ldd	[%o1 + 0x70], %f44
+	ldd	[%o1 + 0x78], %f46
+
+	SHA512
+
+	subcc	%o2, 1, %o2
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x80, %o1
+
+5:
+	std	%f0, [%o0 + 0x00]
+	std	%f2, [%o0 + 0x08]
+	std	%f4, [%o0 + 0x10]
+	std	%f6, [%o0 + 0x18]
+	std	%f8, [%o0 + 0x20]
+	std	%f10, [%o0 + 0x28]
+	std	%f12, [%o0 + 0x30]
+	std	%f14, [%o0 + 0x38]
+	retl
+	 VISExit
+10:
+	alignaddr %o1, %g0, %o1
+
+	ldd	[%o1 + 0x00], %f18
+1:
+	ldd	[%o1 + 0x08], %f20
+	ldd	[%o1 + 0x10], %f22
+	ldd	[%o1 + 0x18], %f24
+	ldd	[%o1 + 0x20], %f26
+	ldd	[%o1 + 0x28], %f28
+	ldd	[%o1 + 0x30], %f30
+	ldd	[%o1 + 0x38], %f32
+	ldd	[%o1 + 0x40], %f34
+	ldd	[%o1 + 0x48], %f36
+	ldd	[%o1 + 0x50], %f38
+	ldd	[%o1 + 0x58], %f40
+	ldd	[%o1 + 0x60], %f42
+	ldd	[%o1 + 0x68], %f44
+	ldd	[%o1 + 0x70], %f46
+	ldd	[%o1 + 0x78], %f48
+	ldd	[%o1 + 0x80], %f50
+
+	faligndata %f18, %f20, %f16
+	faligndata %f20, %f22, %f18
+	faligndata %f22, %f24, %f20
+	faligndata %f24, %f26, %f22
+	faligndata %f26, %f28, %f24
+	faligndata %f28, %f30, %f26
+	faligndata %f30, %f32, %f28
+	faligndata %f32, %f34, %f30
+	faligndata %f34, %f36, %f32
+	faligndata %f36, %f38, %f34
+	faligndata %f38, %f40, %f36
+	faligndata %f40, %f42, %f38
+	faligndata %f42, %f44, %f40
+	faligndata %f44, %f46, %f42
+	faligndata %f46, %f48, %f44
+	faligndata %f48, %f50, %f46
+
+	SHA512
+
+	subcc	%o2, 1, %o2
+	fsrc2	%f50, %f18
+	bne,pt	%xcc, 1b
+	 add	%o1, 0x80, %o1
+
+	ba,a,pt	%xcc, 5b
+ENDPROC(sha512_sparc64_transform)
diff --git a/arch/sparc/crypto/sha512_glue.c b/arch/sparc/crypto/sha512_glue.c
new file mode 100644
index 000000000000..486f0a2b7001
--- /dev/null
+++ b/arch/sparc/crypto/sha512_glue.c
@@ -0,0 +1,226 @@
+/* Glue code for SHA512 hashing optimized for sparc64 crypto opcodes.
+ *
+ * This is based largely upon crypto/sha512_generic.c
+ *
+ * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com>
+ * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ * Copyright (c) 2003 Kyle McMartin <kyle@debian.org>
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+
+#include <asm/pstate.h>
+#include <asm/elf.h>
+
+#include "opcodes.h"
+
+asmlinkage void sha512_sparc64_transform(u64 *digest, const char *data,
+					 unsigned int rounds);
+
+static int sha512_sparc64_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	sctx->state[0] = SHA512_H0;
+	sctx->state[1] = SHA512_H1;
+	sctx->state[2] = SHA512_H2;
+	sctx->state[3] = SHA512_H3;
+	sctx->state[4] = SHA512_H4;
+	sctx->state[5] = SHA512_H5;
+	sctx->state[6] = SHA512_H6;
+	sctx->state[7] = SHA512_H7;
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static int sha384_sparc64_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	sctx->state[0] = SHA384_H0;
+	sctx->state[1] = SHA384_H1;
+	sctx->state[2] = SHA384_H2;
+	sctx->state[3] = SHA384_H3;
+	sctx->state[4] = SHA384_H4;
+	sctx->state[5] = SHA384_H5;
+	sctx->state[6] = SHA384_H6;
+	sctx->state[7] = SHA384_H7;
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static void __sha512_sparc64_update(struct sha512_state *sctx, const u8 *data,
+				    unsigned int len, unsigned int partial)
+{
+	unsigned int done = 0;
+
+	if ((sctx->count[0] += len) < len)
+		sctx->count[1]++;
+	if (partial) {
+		done = SHA512_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha512_sparc64_transform(sctx->state, sctx->buf, 1);
+	}
+	if (len - done >= SHA512_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+
+		sha512_sparc64_transform(sctx->state, data + done, rounds);
+		done += rounds * SHA512_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+}
+
+static int sha512_sparc64_update(struct shash_desc *desc, const u8 *data,
+				 unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA512_BLOCK_SIZE) {
+		if ((sctx->count[0] += len) < len)
+			sctx->count[1]++;
+		memcpy(sctx->buf + partial, data, len);
+	} else
+		__sha512_sparc64_update(sctx, data, len, partial);
+
+	return 0;
+}
+
+static int sha512_sparc64_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be64 *dst = (__be64 *)out;
+	__be64 bits[2];
+	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+	/* Save number of bits */
+	bits[1] = cpu_to_be64(sctx->count[0] << 3);
+	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
+
+	/* Pad out to 112 mod 128 and append length */
+	index = sctx->count[0] % SHA512_BLOCK_SIZE;
+	padlen = (index < 112) ? (112 - index) : ((SHA512_BLOCK_SIZE+112) - index);
+
+	/* We need to fill a whole block for __sha512_sparc64_update() */
+	if (padlen <= 112) {
+		if ((sctx->count[0] += padlen) < padlen)
+			sctx->count[1]++;
+		memcpy(sctx->buf + index, padding, padlen);
+	} else {
+		__sha512_sparc64_update(sctx, padding, padlen, index);
+	}
+	__sha512_sparc64_update(sctx, (const u8 *)&bits, sizeof(bits), 112);
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be64(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha384_sparc64_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 D[64];
+
+	sha512_sparc64_final(desc, D);
+
+	memcpy(hash, D, 48);
+	memset(D, 0, 64);
+
+	return 0;
+}
+
+static struct shash_alg sha512 = {
+	.digestsize	=	SHA512_DIGEST_SIZE,
+	.init		=	sha512_sparc64_init,
+	.update		=	sha512_sparc64_update,
+	.final		=	sha512_sparc64_final,
+	.descsize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha512",
+		.cra_driver_name=	"sha512-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA512_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static struct shash_alg sha384 = {
+	.digestsize	=	SHA384_DIGEST_SIZE,
+	.init		=	sha384_sparc64_init,
+	.update		=	sha512_sparc64_update,
+	.final		=	sha384_sparc64_final,
+	.descsize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha384",
+		.cra_driver_name=	"sha384-sparc64",
+		.cra_priority	=	SPARC_CR_OPCODE_PRIORITY,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA384_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+};
+
+static bool __init sparc64_has_sha512_opcode(void)
+{
+	unsigned long cfr;
+
+	if (!(sparc64_elf_hwcap & HWCAP_SPARC_CRYPTO))
+		return false;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+	if (!(cfr & CFR_SHA512))
+		return false;
+
+	return true;
+}
+
+static int __init sha512_sparc64_mod_init(void)
+{
+	if (sparc64_has_sha512_opcode()) {
+		int ret = crypto_register_shash(&sha384);
+		if (ret < 0)
+			return ret;
+
+		ret = crypto_register_shash(&sha512);
+		if (ret < 0) {
+			crypto_unregister_shash(&sha384);
+			return ret;
+		}
+
+		pr_info("Using sparc64 sha512 opcode optimized SHA-512/SHA-384 implementation\n");
+		return 0;
+	}
+	pr_info("sparc64 sha512 opcode not available.\n");
+	return -ENODEV;
+}
+
+static void __exit sha512_sparc64_mod_fini(void)
+{
+	crypto_unregister_shash(&sha384);
+	crypto_unregister_shash(&sha512);
+}
+
+module_init(sha512_sparc64_mod_init);
+module_exit(sha512_sparc64_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA-384 and SHA-512 Secure Hash Algorithm, sparc64 sha512 opcode accelerated");
+
+MODULE_ALIAS("sha384");
+MODULE_ALIAS("sha512");
diff --git a/arch/sparc/include/asm/asi.h b/arch/sparc/include/asm/asi.h
index 61ebe7411ceb..cc0006dc5d4a 100644
--- a/arch/sparc/include/asm/asi.h
+++ b/arch/sparc/include/asm/asi.h
@@ -141,7 +141,8 @@
 /* SpitFire and later extended ASIs.  The "(III)" marker designates
  * UltraSparc-III and later specific ASIs.  The "(CMT)" marker designates
  * Chip Multi Threading specific ASIs.  "(NG)" designates Niagara specific
- * ASIs, "(4V)" designates SUN4V specific ASIs.
+ * ASIs, "(4V)" designates SUN4V specific ASIs.  "(NG4)" designates SPARC-T4
+ * and later ASIs.
  */
 #define ASI_PHYS_USE_EC		0x14 /* PADDR, E-cachable		*/
 #define ASI_PHYS_BYPASS_EC_E	0x15 /* PADDR, E-bit			*/
@@ -243,6 +244,7 @@
 #define ASI_UDBL_CONTROL_R	0x7f /* External UDB control regs rd low*/
 #define ASI_INTR_R		0x7f /* IRQ vector dispatch read	*/
 #define ASI_INTR_DATAN_R	0x7f /* (III) In irq vector data reg N	*/
+#define ASI_PIC			0xb0 /* (NG4) PIC registers		*/
 #define ASI_PST8_P		0xc0 /* Primary, 8 8-bit, partial	*/
 #define ASI_PST8_S		0xc1 /* Secondary, 8 8-bit, partial	*/
 #define ASI_PST16_P		0xc2 /* Primary, 4 16-bit, partial	*/
diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h
index 7df8b7f544d4..370ca1e71ffb 100644
--- a/arch/sparc/include/asm/elf_64.h
+++ b/arch/sparc/include/asm/elf_64.h
@@ -86,6 +86,15 @@
 #define AV_SPARC_IMA		0x00400000 /* integer multiply-add */
 #define AV_SPARC_ASI_CACHE_SPARING \
 				0x00800000 /* cache sparing ASIs available */
+#define AV_SPARC_PAUSE		0x01000000 /* PAUSE available */
+#define AV_SPARC_CBCOND		0x02000000 /* CBCOND insns available */
+
+/* Solaris decided to enumerate every single crypto instruction type
+ * in the AT_HWCAP bits.  This is wasteful, since if crypto is present,
+ * you still need to look in the CFR register to see if the opcode is
+ * really available.  So we simply advertise only "crypto" support.
+ */
+#define HWCAP_SPARC_CRYPTO	0x04000000 /* CRYPTO insns available */
 
 #define CORE_DUMP_USE_REGSET
 
diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h
index 015a761eaa32..ca121f0fa3ec 100644
--- a/arch/sparc/include/asm/hypervisor.h
+++ b/arch/sparc/include/asm/hypervisor.h
@@ -2934,6 +2934,16 @@ extern unsigned long sun4v_reboot_data_set(unsigned long ra,
 					   unsigned long len);
 #endif
 
+#define HV_FAST_VT_GET_PERFREG		0x184
+#define HV_FAST_VT_SET_PERFREG		0x185
+
+#ifndef __ASSEMBLY__
+extern unsigned long sun4v_vt_get_perfreg(unsigned long reg_num,
+					  unsigned long *reg_val);
+extern unsigned long sun4v_vt_set_perfreg(unsigned long reg_num,
+					  unsigned long reg_val);
+#endif
+
 /* Function numbers for HV_CORE_TRAP.  */
 #define HV_CORE_SET_VER			0x00
 #define HV_CORE_PUTCHAR			0x01
@@ -2964,6 +2974,7 @@ extern unsigned long sun4v_reboot_data_set(unsigned long ra,
 #define HV_GRP_NIU			0x0204
 #define HV_GRP_VF_CPU			0x0205
 #define HV_GRP_KT_CPU			0x0209
+#define HV_GRP_VT_CPU			0x020c
 #define HV_GRP_DIAG			0x0300
 
 #ifndef __ASSEMBLY__
diff --git a/arch/sparc/include/asm/mdesc.h b/arch/sparc/include/asm/mdesc.h
index 9faa046713fb..139097f3a67b 100644
--- a/arch/sparc/include/asm/mdesc.h
+++ b/arch/sparc/include/asm/mdesc.h
@@ -73,6 +73,7 @@ extern void mdesc_register_notifier(struct mdesc_notifier_client *client);
 
 extern void mdesc_fill_in_cpu_data(cpumask_t *mask);
 extern void mdesc_populate_present_mask(cpumask_t *mask);
+extern void mdesc_get_page_sizes(cpumask_t *mask, unsigned long *pgsz_mask);
 
 extern void sun4v_mdesc_init(void);
 
diff --git a/arch/sparc/include/asm/pcr.h b/arch/sparc/include/asm/pcr.h
index 288d7beba051..942bb17f60cd 100644
--- a/arch/sparc/include/asm/pcr.h
+++ b/arch/sparc/include/asm/pcr.h
@@ -2,8 +2,13 @@
 #define __PCR_H
 
 struct pcr_ops {
-	u64 (*read)(void);
-	void (*write)(u64);
+	u64 (*read_pcr)(unsigned long);
+	void (*write_pcr)(unsigned long, u64);
+	u64 (*read_pic)(unsigned long);
+	void (*write_pic)(unsigned long, u64);
+	u64 (*nmi_picl_value)(unsigned int nmi_hz);
+	u64 pcr_nmi_enable;
+	u64 pcr_nmi_disable;
 };
 extern const struct pcr_ops *pcr_ops;
 
@@ -27,21 +32,18 @@ extern void schedule_deferred_pcr_work(void);
 #define PCR_N2_SL1_SHIFT	27
 #define PCR_N2_OV1		0x80000000
 
-extern unsigned int picl_shift;
-
-/* In order to commonize as much of the implementation as
- * possible, we use PICH as our counter.  Mostly this is
- * to accommodate Niagara-1 which can only count insn cycles
- * in PICH.
- */
-static inline u64 picl_value(unsigned int nmi_hz)
-{
-	u32 delta = local_cpu_data().clock_tick / (nmi_hz << picl_shift);
-
-	return ((u64)((0 - delta) & 0xffffffff)) << 32;
-}
-
-extern u64 pcr_enable;
+#define PCR_N4_OV		0x00000001 /* PIC overflow             */
+#define PCR_N4_TOE		0x00000002 /* Trap On Event            */
+#define PCR_N4_UTRACE		0x00000004 /* Trace user events        */
+#define PCR_N4_STRACE		0x00000008 /* Trace supervisor events  */
+#define PCR_N4_HTRACE		0x00000010 /* Trace hypervisor events  */
+#define PCR_N4_MASK		0x000007e0 /* Event mask               */
+#define PCR_N4_MASK_SHIFT	5
+#define PCR_N4_SL		0x0000f800 /* Event Select             */
+#define PCR_N4_SL_SHIFT		11
+#define PCR_N4_PICNPT		0x00010000 /* PIC non-privileged trap  */
+#define PCR_N4_PICNHT		0x00020000 /* PIC non-hypervisor trap  */
+#define PCR_N4_NTC		0x00040000 /* Next-To-Commit wrap      */
 
 extern int pcr_arch_init(void);
 
diff --git a/arch/sparc/include/asm/perfctr.h b/arch/sparc/include/asm/perfctr.h
index 3332d2cba6c1..214feefa577c 100644
--- a/arch/sparc/include/asm/perfctr.h
+++ b/arch/sparc/include/asm/perfctr.h
@@ -54,11 +54,6 @@ enum perfctr_opcode {
 	PERFCTR_GETPCR
 };
 
-/* I don't want the kernel's namespace to be polluted with this
- * stuff when this file is included.  --DaveM
- */
-#ifndef __KERNEL__
-
 #define  PRIV 0x00000001
 #define  SYS  0x00000002
 #define  USR  0x00000004
@@ -168,29 +163,4 @@ struct vcounter_struct {
   unsigned long long vcnt1;
 };
 
-#else /* !(__KERNEL__) */
-
-#ifndef CONFIG_SPARC32
-
-/* Performance counter register access. */
-#define read_pcr(__p)  __asm__ __volatile__("rd	%%pcr, %0" : "=r" (__p))
-#define write_pcr(__p) __asm__ __volatile__("wr	%0, 0x0, %%pcr" : : "r" (__p))
-#define read_pic(__p)  __asm__ __volatile__("rd %%pic, %0" : "=r" (__p))
-
-/* Blackbird errata workaround.  See commentary in
- * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt()
- * for more information.
- */
-#define write_pic(__p)  					\
-	__asm__ __volatile__("ba,pt	%%xcc, 99f\n\t"		\
-			     " nop\n\t"				\
-			     ".align	64\n"			\
-			  "99:wr	%0, 0x0, %%pic\n\t"	\
-			     "rd	%%pic, %%g0" : : "r" (__p))
-#define reset_pic()	write_pic(0)
-
-#endif /* !CONFIG_SPARC32 */
-
-#endif /* !(__KERNEL__) */
-
 #endif /* !(PERF_COUNTER_API) */
diff --git a/arch/sparc/include/asm/pstate.h b/arch/sparc/include/asm/pstate.h
index a26a53777bb0..4b6b998afd99 100644
--- a/arch/sparc/include/asm/pstate.h
+++ b/arch/sparc/include/asm/pstate.h
@@ -88,4 +88,18 @@
 #define VERS_MAXTL	_AC(0x000000000000ff00,UL) /* Max Trap Level.	*/
 #define VERS_MAXWIN	_AC(0x000000000000001f,UL) /* Max RegWindow Idx.*/
 
+/* Compatibility Feature Register (%asr26), SPARC-T4 and later */
+#define CFR_AES		_AC(0x0000000000000001,UL) /* Supports AES opcodes     */
+#define CFR_DES		_AC(0x0000000000000002,UL) /* Supports DES opcodes     */
+#define CFR_KASUMI	_AC(0x0000000000000004,UL) /* Supports KASUMI opcodes  */
+#define CFR_CAMELLIA	_AC(0x0000000000000008,UL) /* Supports CAMELLIA opcodes*/
+#define CFR_MD5		_AC(0x0000000000000010,UL) /* Supports MD5 opcodes     */
+#define CFR_SHA1	_AC(0x0000000000000020,UL) /* Supports SHA1 opcodes    */
+#define CFR_SHA256	_AC(0x0000000000000040,UL) /* Supports SHA256 opcodes  */
+#define CFR_SHA512	_AC(0x0000000000000080,UL) /* Supports SHA512 opcodes  */
+#define CFR_MPMUL	_AC(0x0000000000000100,UL) /* Supports MPMUL opcodes   */
+#define CFR_MONTMUL	_AC(0x0000000000000200,UL) /* Supports MONTMUL opcodes */
+#define CFR_MONTSQR	_AC(0x0000000000000400,UL) /* Supports MONTSQR opcodes */
+#define CFR_CRC32C	_AC(0x0000000000000800,UL) /* Supports CRC32C opcodes  */
+
 #endif /* !(_SPARC64_PSTATE_H) */
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index b42ddbf9651e..ee5dcced2499 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -559,10 +559,10 @@ niagara_tlb_fixup:
 	be,pt	%xcc, niagara2_patch
 	 nop
 	cmp	%g1, SUN4V_CHIP_NIAGARA4
-	be,pt	%xcc, niagara2_patch
+	be,pt	%xcc, niagara4_patch
 	 nop
 	cmp	%g1, SUN4V_CHIP_NIAGARA5
-	be,pt	%xcc, niagara2_patch
+	be,pt	%xcc, niagara4_patch
 	 nop
 
 	call	generic_patch_copyops
@@ -573,6 +573,16 @@ niagara_tlb_fixup:
 	 nop
 
 	ba,a,pt	%xcc, 80f
+niagara4_patch:
+	call	niagara4_patch_copyops
+	 nop
+	call	niagara_patch_bzero
+	 nop
+	call	niagara4_patch_pageops
+	 nop
+
+	ba,a,pt	%xcc, 80f
+
 niagara2_patch:
 	call	niagara2_patch_copyops
 	 nop
diff --git a/arch/sparc/kernel/hvapi.c b/arch/sparc/kernel/hvapi.c
index 8593672838fd..1032df43ec95 100644
--- a/arch/sparc/kernel/hvapi.c
+++ b/arch/sparc/kernel/hvapi.c
@@ -45,6 +45,7 @@ static struct api_info api_table[] = {
 	{ .group = HV_GRP_NIU,					},
 	{ .group = HV_GRP_VF_CPU,				},
 	{ .group = HV_GRP_KT_CPU,				},
+	{ .group = HV_GRP_VT_CPU,				},
 	{ .group = HV_GRP_DIAG,		.flags = FLAG_PRE_API	},
 };
 
diff --git a/arch/sparc/kernel/hvcalls.S b/arch/sparc/kernel/hvcalls.S
index 58d60de4d65b..f3ab509b76a8 100644
--- a/arch/sparc/kernel/hvcalls.S
+++ b/arch/sparc/kernel/hvcalls.S
@@ -805,3 +805,19 @@ ENTRY(sun4v_reboot_data_set)
 	retl
 	 nop
 ENDPROC(sun4v_reboot_data_set)
+
+ENTRY(sun4v_vt_get_perfreg)
+	mov	%o1, %o4
+	mov	HV_FAST_VT_GET_PERFREG, %o5
+	ta	HV_FAST_TRAP
+	stx	%o1, [%o4]
+	retl
+	 nop
+ENDPROC(sun4v_vt_get_perfreg)
+
+ENTRY(sun4v_vt_set_perfreg)
+	mov	HV_FAST_VT_SET_PERFREG, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+ENDPROC(sun4v_vt_set_perfreg)
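
Both stubs follow the sun4v fast-trap convention: the function number goes in %o5, arguments in %o0/%o1, and the status comes back in %o0, with the get stub storing the returned register value through the pointer it saved in %o4. A hedged C caller (function name hypothetical):

	/* HV_EOK is the hypervisor "no error" status. */
	static int read_vt_perfreg(unsigned long reg_num, unsigned long *val)
	{
		unsigned long hv_err = sun4v_vt_get_perfreg(reg_num, val);

		return (hv_err == HV_EOK) ? 0 : -EIO;
	}
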
diff --git a/arch/sparc/kernel/ktlb.S b/arch/sparc/kernel/ktlb.S
index 79f310364849..0746e5e32b37 100644
--- a/arch/sparc/kernel/ktlb.S
+++ b/arch/sparc/kernel/ktlb.S
@@ -188,31 +188,26 @@ valid_addr_bitmap_patch:
 	be,pn		%xcc, kvmap_dtlb_longpath
 
 2:	 sethi		%hi(kpte_linear_bitmap), %g2
-	or		%g2, %lo(kpte_linear_bitmap), %g2
 
 	/* Get the 256MB physical address index. */
 	sllx		%g4, 21, %g5
-	mov		1, %g7
+	or		%g2, %lo(kpte_linear_bitmap), %g2
 	srlx		%g5, 21 + 28, %g5
+	and		%g5, (32 - 1), %g7
 
-	/* Don't try this at home kids... this depends upon srlx
-	 * only taking the low 6 bits of the shift count in %g5.
-	 */
-	sllx		%g7, %g5, %g7
-
-	/* Divide by 64 to get the offset into the bitmask.  */
-	srlx		%g5, 6, %g5
+	/* Divide by 32 to get the offset into the bitmask.  */
+	srlx		%g5, 5, %g5
+	add		%g7, %g7, %g7
 	sllx		%g5, 3, %g5
 
-	/* kern_linear_pte_xor[((mask & bit) ? 1 : 0)] */
+	/* kern_linear_pte_xor[(mask >> shift) & 3] */
 	ldx		[%g2 + %g5], %g2
-	andcc		%g2, %g7, %g0
+	srlx		%g2, %g7, %g7
 	sethi		%hi(kern_linear_pte_xor), %g5
+	and		%g7, 3, %g7
 	or		%g5, %lo(kern_linear_pte_xor), %g5
-	bne,a,pt	%xcc, 1f
-	 add		%g5, 8, %g5
-
-1:	ldx		[%g5], %g2
+	sllx		%g7, 3, %g7
+	ldx		[%g5 + %g7], %g2
 
 	.globl		kvmap_linear_patch
 kvmap_linear_patch:
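
The rewritten fast path widens each 256MB region's marker in kpte_linear_bitmap from one bit to two, indexing a four-entry kern_linear_pte_xor[] instead of a two-entry one. In C the new lookup amounts to (illustrative rendering, not code from the patch):

	/* idx is the 256MB region number; each 64-bit bitmap word now
	 * holds 32 two-bit selectors. */
	static unsigned long kpte_pte_xor(unsigned long idx)
	{
		unsigned long word = kpte_linear_bitmap[idx >> 5];
		unsigned long sel = (word >> ((idx & 31) * 2)) & 3;

		return kern_linear_pte_xor[sel];
	}
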
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 6dc796280589..831c001604e8 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -817,6 +817,30 @@ void __cpuinit mdesc_populate_present_mask(cpumask_t *mask)
 	mdesc_iterate_over_cpus(record_one_cpu, NULL, mask);
 }
 
+static void * __init check_one_pgsz(struct mdesc_handle *hp, u64 mp, int cpuid, void *arg)
+{
+	const u64 *pgsz_prop = mdesc_get_property(hp, mp, "mmu-page-size-list", NULL);
+	unsigned long *pgsz_mask = arg;
+	u64 val;
+
+	val = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K |
+	       HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB);
+	if (pgsz_prop)
+		val = *pgsz_prop;
+
+	if (!*pgsz_mask)
+		*pgsz_mask = val;
+	else
+		*pgsz_mask &= val;
+	return NULL;
+}
+
+void __init mdesc_get_page_sizes(cpumask_t *mask, unsigned long *pgsz_mask)
+{
+	*pgsz_mask = 0;
+	mdesc_iterate_over_cpus(check_one_pgsz, pgsz_mask, mask);
+}
+
 static void * __cpuinit fill_in_one_cpu(struct mdesc_handle *hp, u64 mp, int cpuid, void *arg)
 {
 	const u64 *cfreq = mdesc_get_property(hp, mp, "clock-frequency", NULL);
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index eb1c1f010a47..6479256fd5a4 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -22,7 +22,6 @@
 #include <asm/perf_event.h>
 #include <asm/ptrace.h>
 #include <asm/pcr.h>
-#include <asm/perfctr.h>
 
 #include "kstack.h"
 
@@ -109,7 +108,7 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
 		touched = 1;
 	else
-		pcr_ops->write(PCR_PIC_PRIV);
+		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
 
 	sum = local_cpu_data().irq0_irqs;
 	if (__get_cpu_var(nmi_touch)) {
@@ -126,8 +125,8 @@ notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
 		__this_cpu_write(alert_counter, 0);
 	}
 	if (__get_cpu_var(wd_enabled)) {
-		write_pic(picl_value(nmi_hz));
-		pcr_ops->write(pcr_enable);
+		pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));
+		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
 	}
 
 	restore_hardirq_stack(orig_sp);
@@ -166,7 +165,7 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
 
 void stop_nmi_watchdog(void *unused)
 {
-	pcr_ops->write(PCR_PIC_PRIV);
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
 	__get_cpu_var(wd_enabled) = 0;
 	atomic_dec(&nmi_active);
 }
@@ -223,10 +222,10 @@ void start_nmi_watchdog(void *unused)
 	__get_cpu_var(wd_enabled) = 1;
 	atomic_inc(&nmi_active);
 
-	pcr_ops->write(PCR_PIC_PRIV);
-	write_pic(picl_value(nmi_hz));
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
+	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));
 
-	pcr_ops->write(pcr_enable);
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
 }
 
 static void nmi_adjust_hz_one(void *unused)
@@ -234,10 +233,10 @@ static void nmi_adjust_hz_one(void *unused)
 	if (!__get_cpu_var(wd_enabled))
 		return;
 
-	pcr_ops->write(PCR_PIC_PRIV);
-	write_pic(picl_value(nmi_hz));
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
+	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));
 
-	pcr_ops->write(pcr_enable);
+	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
 }
 
 void nmi_adjust_hz(unsigned int new_hz)
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 7661e84a05a0..051b69caeffd 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -594,7 +594,7 @@ static int __devinit pci_sun4v_iommu_init(struct pci_pbm_info *pbm)
 		printk(KERN_ERR PFX "Strange virtual-dma[%08x:%08x].\n",
 		       vdma[0], vdma[1]);
 		return -EINVAL;
-	};
+	}
 
 	dma_mask = (roundup_pow_of_two(vdma[1]) - 1UL);
 	num_tsb_entries = vdma[1] / IO_PAGE_SIZE;
diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c
index 0ce0dd2332aa..269af58497aa 100644
--- a/arch/sparc/kernel/pcr.c
+++ b/arch/sparc/kernel/pcr.c
@@ -13,23 +13,14 @@
 #include <asm/pil.h>
 #include <asm/pcr.h>
 #include <asm/nmi.h>
+#include <asm/asi.h>
 #include <asm/spitfire.h>
-#include <asm/perfctr.h>
 
 /* This code is shared between various users of the performance
  * counters.  Users will be oprofile, pseudo-NMI watchdog, and the
  * perf_event support layer.
  */
 
-#define PCR_SUN4U_ENABLE	(PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE)
-#define PCR_N2_ENABLE		(PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE | \
-				 PCR_N2_TOE_OV1 | \
-				 (2 << PCR_N2_SL1_SHIFT) | \
-				 (0xff << PCR_N2_MASK1_SHIFT))
-
-u64 pcr_enable;
-unsigned int picl_shift;
-
 /* Performance counter interrupts run unmasked at PIL level 15.
  * Therefore we can't do things like wakeups and other work
  * that expects IRQ disabling to be adhered to in locking etc.
@@ -60,39 +51,144 @@ void arch_irq_work_raise(void)
 const struct pcr_ops *pcr_ops;
 EXPORT_SYMBOL_GPL(pcr_ops);
 
-static u64 direct_pcr_read(void)
+static u64 direct_pcr_read(unsigned long reg_num)
 {
 	u64 val;
 
-	read_pcr(val);
+	WARN_ON_ONCE(reg_num != 0);
+	__asm__ __volatile__("rd %%pcr, %0" : "=r" (val));
 	return val;
 }
 
-static void direct_pcr_write(u64 val)
+static void direct_pcr_write(unsigned long reg_num, u64 val)
+{
+	WARN_ON_ONCE(reg_num != 0);
+	__asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (val));
+}
+
+static u64 direct_pic_read(unsigned long reg_num)
 {
-	write_pcr(val);
+	u64 val;
+
+	WARN_ON_ONCE(reg_num != 0);
+	__asm__ __volatile__("rd %%pic, %0" : "=r" (val));
+	return val;
+}
+
+static void direct_pic_write(unsigned long reg_num, u64 val)
+{
+	WARN_ON_ONCE(reg_num != 0);
+
+	/* Blackbird errata workaround.  See commentary in
+	 * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt()
+	 * for more information.
+	 */
+	__asm__ __volatile__("ba,pt	%%xcc, 99f\n\t"
+			     " nop\n\t"
+			     ".align	64\n"
+			  "99:wr	%0, 0x0, %%pic\n\t"
+			     "rd	%%pic, %%g0" : : "r" (val));
+}
+
+static u64 direct_picl_value(unsigned int nmi_hz)
+{
+	u32 delta = local_cpu_data().clock_tick / nmi_hz;
+
+	return ((u64)((0 - delta) & 0xffffffff)) << 32;
 }
 
 static const struct pcr_ops direct_pcr_ops = {
-	.read	= direct_pcr_read,
-	.write	= direct_pcr_write,
+	.read_pcr		= direct_pcr_read,
+	.write_pcr		= direct_pcr_write,
+	.read_pic		= direct_pic_read,
+	.write_pic		= direct_pic_write,
+	.nmi_picl_value		= direct_picl_value,
+	.pcr_nmi_enable		= (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE),
+	.pcr_nmi_disable	= PCR_PIC_PRIV,
 };
 
-static void n2_pcr_write(u64 val)
+static void n2_pcr_write(unsigned long reg_num, u64 val)
 {
 	unsigned long ret;
 
+	WARN_ON_ONCE(reg_num != 0);
 	if (val & PCR_N2_HTRACE) {
 		ret = sun4v_niagara2_setperf(HV_N2_PERF_SPARC_CTL, val);
 		if (ret != HV_EOK)
-			write_pcr(val);
+			direct_pcr_write(reg_num, val);
 	} else
-		write_pcr(val);
+		direct_pcr_write(reg_num, val);
+}
+
+static u64 n2_picl_value(unsigned int nmi_hz)
+{
+	u32 delta = local_cpu_data().clock_tick / (nmi_hz << 2);
+
+	return ((u64)((0 - delta) & 0xffffffff)) << 32;
 }
 
 static const struct pcr_ops n2_pcr_ops = {
-	.read	= direct_pcr_read,
-	.write	= n2_pcr_write,
+	.read_pcr		= direct_pcr_read,
+	.write_pcr		= n2_pcr_write,
+	.read_pic		= direct_pic_read,
+	.write_pic		= direct_pic_write,
+	.nmi_picl_value		= n2_picl_value,
+	.pcr_nmi_enable		= (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE |
+				   PCR_N2_TOE_OV1 |
+				   (2 << PCR_N2_SL1_SHIFT) |
+				   (0xff << PCR_N2_MASK1_SHIFT)),
+	.pcr_nmi_disable	= PCR_PIC_PRIV,
+};
+
+static u64 n4_pcr_read(unsigned long reg_num)
+{
+	unsigned long val;
+
+	(void) sun4v_vt_get_perfreg(reg_num, &val);
+
+	return val;
+}
+
+static void n4_pcr_write(unsigned long reg_num, u64 val)
+{
+	(void) sun4v_vt_set_perfreg(reg_num, val);
+}
+
+static u64 n4_pic_read(unsigned long reg_num)
+{
+	unsigned long val;
+
+	__asm__ __volatile__("ldxa [%1] %2, %0"
+			     : "=r" (val)
+			     : "r" (reg_num * 0x8UL), "i" (ASI_PIC));
+
+	return val;
+}
+
+static void n4_pic_write(unsigned long reg_num, u64 val)
+{
+	__asm__ __volatile__("stxa %0, [%1] %2"
+			     : /* no outputs */
+			     : "r" (val), "r" (reg_num * 0x8UL), "i" (ASI_PIC));
+}
+
+static u64 n4_picl_value(unsigned int nmi_hz)
+{
+	u32 delta = local_cpu_data().clock_tick / (nmi_hz << 2);
+
+	return ((u64)((0 - delta) & 0xffffffff));
+}
+
+static const struct pcr_ops n4_pcr_ops = {
+	.read_pcr		= n4_pcr_read,
+	.write_pcr		= n4_pcr_write,
+	.read_pic		= n4_pic_read,
+	.write_pic		= n4_pic_write,
+	.nmi_picl_value		= n4_picl_value,
+	.pcr_nmi_enable		= (PCR_N4_PICNPT | PCR_N4_STRACE |
+				   PCR_N4_UTRACE | PCR_N4_TOE |
+				   (26 << PCR_N4_SL_SHIFT)),
+	.pcr_nmi_disable	= PCR_N4_PICNPT,
 };
 
 static unsigned long perf_hsvc_group;
@@ -115,6 +211,10 @@ static int __init register_perf_hsvc(void)
 			perf_hsvc_group = HV_GRP_KT_CPU;
 			break;
 
+		case SUN4V_CHIP_NIAGARA4:
+			perf_hsvc_group = HV_GRP_VT_CPU;
+			break;
+
 		default:
 			return -ENODEV;
 		}
@@ -139,6 +239,29 @@ static void __init unregister_perf_hsvc(void)
 	sun4v_hvapi_unregister(perf_hsvc_group);
 }
 
+static int __init setup_sun4v_pcr_ops(void)
+{
+	int ret = 0;
+
+	switch (sun4v_chip_type) {
+	case SUN4V_CHIP_NIAGARA1:
+	case SUN4V_CHIP_NIAGARA2:
+	case SUN4V_CHIP_NIAGARA3:
+		pcr_ops = &n2_pcr_ops;
+		break;
+
+	case SUN4V_CHIP_NIAGARA4:
+		pcr_ops = &n4_pcr_ops;
+		break;
+
+	default:
+		ret = -ENODEV;
+		break;
+	}
+
+	return ret;
+}
+
 int __init pcr_arch_init(void)
 {
 	int err = register_perf_hsvc();
@@ -148,15 +271,14 @@ int __init pcr_arch_init(void)
 
 	switch (tlb_type) {
 	case hypervisor:
-		pcr_ops = &n2_pcr_ops;
-		pcr_enable = PCR_N2_ENABLE;
-		picl_shift = 2;
+		err = setup_sun4v_pcr_ops();
+		if (err)
+			goto out_unregister;
 		break;
 
 	case cheetah:
 	case cheetah_plus:
 		pcr_ops = &direct_pcr_ops;
-		pcr_enable = PCR_SUN4U_ENABLE;
 		break;
 
 	case spitfire:
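
All three nmi_picl_value implementations program the counter with the two's complement of the desired event count, so the 32-bit PIC overflows — and raises the profiling NMI — after exactly `delta` events. A worked sketch with assumed numbers (clock_tick = 1 GHz, nmi_hz = 100):

	/* n2/n4 scale by 4 (nmi_hz << 2): delta = 1e9 / 400 = 2500000.
	 * Pre-T4 parts pack two 32-bit counters into one register, so the
	 * value is shifted into the upper half; T4 counters live in
	 * separate registers, so n4_picl_value() omits the << 32. */
	static u64 picl_for(unsigned long clock_tick, unsigned int nmi_hz,
			    unsigned int shift, bool upper)
	{
		u32 delta = clock_tick / (nmi_hz << shift);
		u64 val = (u64)((0 - delta) & 0xffffffff);

		return upper ? val << 32 : val;
	}
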
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 5713957dcb8a..e48651dace1b 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -25,36 +25,48 @@
 #include <linux/atomic.h>
 #include <asm/nmi.h>
 #include <asm/pcr.h>
-#include <asm/perfctr.h>
 #include <asm/cacheflush.h>
 
 #include "kernel.h"
 #include "kstack.h"
 
-/* Sparc64 chips have two performance counters, 32-bits each, with
- * overflow interrupts generated on transition from 0xffffffff to 0.
- * The counters are accessed in one go using a 64-bit register.
+/* Two classes of sparc64 chips currently exist, both of which have
+ * 32-bit counters that can generate overflow interrupts on the
+ * transition from 0xffffffff to 0.
  *
- * Both counters are controlled using a single control register.  The
- * only way to stop all sampling is to clear all of the context (user,
- * supervisor, hypervisor) sampling enable bits.  But these bits apply
- * to both counters, thus the two counters can't be enabled/disabled
- * individually.
+ * All chips up to and including SPARC-T3 have two performance
+ * counters.  The two 32-bit counters are accessed in one go using a
+ * single 64-bit register.
  *
- * The control register has two event fields, one for each of the two
- * counters.  It's thus nearly impossible to have one counter going
- * while keeping the other one stopped.  Therefore it is possible to
- * get overflow interrupts for counters not currently "in use" and
- * that condition must be checked in the overflow interrupt handler.
+ * On these older chips both counters are controlled using a single
+ * control register.  The only way to stop all sampling is to clear
+ * all of the context (user, supervisor, hypervisor) sampling enable
+ * bits.  But these bits apply to both counters, thus the two counters
+ * can't be enabled/disabled individually.
+ *
+ * Furthermore, the control register on these older chips has two
+ * event fields, one for each of the two counters.  It's thus nearly
+ * impossible to have one counter going while keeping the other one
+ * stopped.  Therefore it is possible to get overflow interrupts for
+ * counters not currently "in use" and that condition must be checked
+ * in the overflow interrupt handler.
  *
  * So we use a hack, in that we program inactive counters with the
  * "sw_count0" and "sw_count1" events.  These count how many times
  * the instruction "sethi %hi(0xfc000), %g0" is executed.  It's an
  * unusual way to encode a NOP and therefore will not trigger in
  * normal code.
+ *
+ * Starting with SPARC-T4 we have one control register per counter,
+ * and the counters are stored in individual registers.  The registers
+ * for the counters are 64-bit but only a 32-bit counter is
+ * implemented.  The event selections on SPARC-T4 lack any
+ * restrictions, so we can elide all of the complicated
+ * conflict resolution code we have for SPARC-T3 and earlier chips.
  */
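For illustration, a minimal C sketch (not kernel code; the fake_pic variable
and helpers below are hypothetical stand-ins for the %pic ASR) of the
pre-SPARC-T4 model described above, where one 64-bit register holds two
32-bit counters and a sampling period is armed as a negative count:

	#include <stdint.h>

	static uint64_t fake_pic;	/* models the single 64-bit %pic */

	static uint32_t pmc_read(int upper)
	{
		uint64_t v = fake_pic;	/* a %pic read on real hardware */
		return (uint32_t)(upper ? v >> 32 : v);
	}

	static void pmc_write(int upper, uint32_t val)
	{
		int shift = upper ? 32 : 0;
		uint64_t mask = 0xffffffffULL << shift;

		fake_pic = (fake_pic & ~mask) | ((uint64_t)val << shift);
	}

	/* Arming with -period makes the counter wrap to 0 (raising the
	 * overflow interrupt) after 'period' events, as done by
	 * sparc_perf_event_set_period() later in this patch. */
	static void pmc_arm(int upper, uint32_t period)
	{
		pmc_write(upper, (uint32_t)-period);
	}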
 
-#define MAX_HWEVENTS			2
+#define MAX_HWEVENTS			4
+#define MAX_PCRS			4
 #define MAX_PERIOD			((1UL << 32) - 1)
 
 #define PIC_UPPER_INDEX			0
@@ -90,8 +102,8 @@ struct cpu_hw_events {
 	 */
 	int			current_idx[MAX_HWEVENTS];
 
-	/* Software copy of %pcr register on this cpu.  */
-	u64			pcr;
+	/* Software copy of %pcr register(s) on this cpu.  */
+	u64			pcr[MAX_HWEVENTS];
 
 	/* Enabled/disable state.  */
 	int			enabled;
@@ -103,6 +115,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, };
 /* An event map describes the characteristics of a performance
  * counter event.  In particular it gives the encoding as well as
  * a mask telling which counters the event can be measured on.
+ *
+ * The mask is unused on SPARC-T4 and later.
  */
 struct perf_event_map {
 	u16	encoding;
@@ -142,15 +156,53 @@ struct sparc_pmu {
 	const struct perf_event_map	*(*event_map)(int);
 	const cache_map_t		*cache_map;
 	int				max_events;
+	u32				(*read_pmc)(int);
+	void				(*write_pmc)(int, u64);
 	int				upper_shift;
 	int				lower_shift;
 	int				event_mask;
+	int				user_bit;
+	int				priv_bit;
 	int				hv_bit;
 	int				irq_bit;
 	int				upper_nop;
 	int				lower_nop;
+	unsigned int			flags;
+#define SPARC_PMU_ALL_EXCLUDES_SAME	0x00000001
+#define SPARC_PMU_HAS_CONFLICTS		0x00000002
+	int				max_hw_events;
+	int				num_pcrs;
+	int				num_pic_regs;
 };
 
+static u32 sparc_default_read_pmc(int idx)
+{
+	u64 val;
+
+	val = pcr_ops->read_pic(0);
+	if (idx == PIC_UPPER_INDEX)
+		val >>= 32;
+
+	return val & 0xffffffff;
+}
+
+static void sparc_default_write_pmc(int idx, u64 val)
+{
+	u64 shift, mask, pic;
+
+	shift = 0;
+	if (idx == PIC_UPPER_INDEX)
+		shift = 32;
+
+	mask = ((u64) 0xffffffff) << shift;
+	val <<= shift;
+
+	pic = pcr_ops->read_pic(0);
+	pic &= ~mask;
+	pic |= val;
+	pcr_ops->write_pic(0, pic);
+}
+
 static const struct perf_event_map ultra3_perfmon_event_map[] = {
 	[PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER },
 	[PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER },
@@ -268,11 +320,20 @@ static const struct sparc_pmu ultra3_pmu = {
 	.event_map	= ultra3_event_map,
 	.cache_map	= &ultra3_cache_map,
 	.max_events	= ARRAY_SIZE(ultra3_perfmon_event_map),
+	.read_pmc	= sparc_default_read_pmc,
+	.write_pmc	= sparc_default_write_pmc,
 	.upper_shift	= 11,
 	.lower_shift	= 4,
 	.event_mask	= 0x3f,
+	.user_bit	= PCR_UTRACE,
+	.priv_bit	= PCR_STRACE,
 	.upper_nop	= 0x1c,
 	.lower_nop	= 0x14,
+	.flags		= (SPARC_PMU_ALL_EXCLUDES_SAME |
+			   SPARC_PMU_HAS_CONFLICTS),
+	.max_hw_events	= 2,
+	.num_pcrs	= 1,
+	.num_pic_regs	= 1,
 };
 
 /* Niagara1 is very limited.  The upper PIC is hard-locked to count
@@ -397,11 +458,20 @@ static const struct sparc_pmu niagara1_pmu = {
 	.event_map	= niagara1_event_map,
 	.cache_map	= &niagara1_cache_map,
 	.max_events	= ARRAY_SIZE(niagara1_perfmon_event_map),
+	.read_pmc	= sparc_default_read_pmc,
+	.write_pmc	= sparc_default_write_pmc,
 	.upper_shift	= 0,
 	.lower_shift	= 4,
 	.event_mask	= 0x7,
+	.user_bit	= PCR_UTRACE,
+	.priv_bit	= PCR_STRACE,
 	.upper_nop	= 0x0,
 	.lower_nop	= 0x0,
+	.flags		= (SPARC_PMU_ALL_EXCLUDES_SAME |
+			   SPARC_PMU_HAS_CONFLICTS),
+	.max_hw_events	= 2,
+	.num_pcrs	= 1,
+	.num_pic_regs	= 1,
 };
 
 static const struct perf_event_map niagara2_perfmon_event_map[] = {
@@ -523,13 +593,203 @@ static const struct sparc_pmu niagara2_pmu = {
 	.event_map	= niagara2_event_map,
 	.cache_map	= &niagara2_cache_map,
 	.max_events	= ARRAY_SIZE(niagara2_perfmon_event_map),
+	.read_pmc	= sparc_default_read_pmc,
+	.write_pmc	= sparc_default_write_pmc,
 	.upper_shift	= 19,
 	.lower_shift	= 6,
 	.event_mask	= 0xfff,
-	.hv_bit		= 0x8,
+	.user_bit	= PCR_UTRACE,
+	.priv_bit	= PCR_STRACE,
+	.hv_bit		= PCR_N2_HTRACE,
 	.irq_bit	= 0x30,
 	.upper_nop	= 0x220,
 	.lower_nop	= 0x220,
+	.flags		= (SPARC_PMU_ALL_EXCLUDES_SAME |
+			   SPARC_PMU_HAS_CONFLICTS),
+	.max_hw_events	= 2,
+	.num_pcrs	= 1,
+	.num_pic_regs	= 1,
+};
+
+static const struct perf_event_map niagara4_perfmon_event_map[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] = { (26 << 6) },
+	[PERF_COUNT_HW_INSTRUCTIONS] = { (3 << 6) | 0x3f },
+	[PERF_COUNT_HW_CACHE_REFERENCES] = { (3 << 6) | 0x04 },
+	[PERF_COUNT_HW_CACHE_MISSES] = { (16 << 6) | 0x07 },
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { (4 << 6) | 0x01 },
+	[PERF_COUNT_HW_BRANCH_MISSES] = { (25 << 6) | 0x0f },
+};
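The literals in this table pack a selector into bits 10:6 with a sub-event
mask in the low six bits, consistent with the 0x7ff event_mask in the
niagara4_pmu definition below; a hypothetical decoding helper (not part of
the patch), for illustration only:

	/* Assumed <select:5><mask:6> packing of the (N << 6) | M
	 * encodings used in niagara4_perfmon_event_map above. */
	static inline unsigned int n4_event_select(unsigned int enc)
	{
		return (enc >> 6) & 0x1f;	/* e.g. 26 for CPU_CYCLES */
	}

	static inline unsigned int n4_event_submask(unsigned int enc)
	{
		return enc & 0x3f;	/* e.g. 0x3f for INSTRUCTIONS */
	}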
+
+static const struct perf_event_map *niagara4_event_map(int event_id)
+{
+	return &niagara4_perfmon_event_map[event_id];
+}
+
+static const cache_map_t niagara4_cache_map = {
+[C(L1D)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x04 },
+		[C(RESULT_MISS)] = { (16 << 6) | 0x07 },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x08 },
+		[C(RESULT_MISS)] = { (16 << 6) | 0x07 },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(L1I)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x3f },
+		[C(RESULT_MISS)] = { (11 << 6) | 0x03 },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_NONSENSE },
+		[C(RESULT_MISS)] = { CACHE_OP_NONSENSE },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(LL)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x04 },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { (3 << 6) | 0x08 },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(DTLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { (17 << 6) | 0x3f },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(ITLB)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { (6 << 6) | 0x3f },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(BPU)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+[C(NODE)] = {
+	[C(OP_READ)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_WRITE)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+	[C(OP_PREFETCH)] = {
+		[C(RESULT_ACCESS)] = { CACHE_OP_UNSUPPORTED },
+		[C(RESULT_MISS)] = { CACHE_OP_UNSUPPORTED },
+	},
+},
+};
+
+static u32 sparc_vt_read_pmc(int idx)
+{
+	u64 val = pcr_ops->read_pic(idx);
+
+	return val & 0xffffffff;
+}
+
+static void sparc_vt_write_pmc(int idx, u64 val)
+{
+	u64 pcr;
+
+	/* There seems to be an internal latch on the overflow event
+	 * on SPARC-T4 that prevents it from triggering unless you
+	 * update the PIC exactly as we do here.  The requirement
+	 * seems to be that you have to turn off event counting in the
+	 * PCR around the PIC update.
+	 *
+	 * For example, after the following sequence:
+	 *
+	 * 1) set PIC to -1
+	 * 2) enable event counting and overflow reporting in PCR
+	 * 3) overflow triggers, softint 15 handler invoked
+	 * 4) clear OV bit in PCR
+	 * 5) write PIC to -1
+	 *
+	 * a subsequent overflow event will not trigger.  This
+	 * sequence works on SPARC-T3 and previous chips.
+	 */
+	pcr = pcr_ops->read_pcr(idx);
+	pcr_ops->write_pcr(idx, PCR_N4_PICNPT);
+
+	pcr_ops->write_pic(idx, val & 0xffffffff);
+
+	pcr_ops->write_pcr(idx, pcr);
+}
+
+static const struct sparc_pmu niagara4_pmu = {
+	.event_map	= niagara4_event_map,
+	.cache_map	= &niagara4_cache_map,
+	.max_events	= ARRAY_SIZE(niagara4_perfmon_event_map),
+	.read_pmc	= sparc_vt_read_pmc,
+	.write_pmc	= sparc_vt_write_pmc,
+	.upper_shift	= 5,
+	.lower_shift	= 5,
+	.event_mask	= 0x7ff,
+	.user_bit	= PCR_N4_UTRACE,
+	.priv_bit	= PCR_N4_STRACE,
+
+	/* We explicitly don't support hypervisor tracing.  The T4
+	 * generates the overflow event for precise events via a trap
+	 * which will not be generated (i.e. it is completely lost) if
+	 * we happen to be in the hypervisor when the event triggers.
+	 * Essentially, the overflow event reporting is completely
+	 * unusable when you have hypervisor mode tracing enabled.
+	 */
+	.hv_bit		= 0,
+
+	.irq_bit	= PCR_N4_TOE,
+	.upper_nop	= 0,
+	.lower_nop	= 0,
+	.flags		= 0,
+	.max_hw_events	= 4,
+	.num_pcrs	= 4,
+	.num_pic_regs	= 4,
 };
 
 static const struct sparc_pmu *sparc_pmu __read_mostly;
@@ -558,55 +818,35 @@ static u64 nop_for_index(int idx)
 static inline void sparc_pmu_enable_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc, int idx)
 {
 	u64 val, mask = mask_for_index(idx);
+	int pcr_index = 0;
 
-	val = cpuc->pcr;
+	if (sparc_pmu->num_pcrs > 1)
+		pcr_index = idx;
+
+	val = cpuc->pcr[pcr_index];
 	val &= ~mask;
 	val |= hwc->config;
-	cpuc->pcr = val;
+	cpuc->pcr[pcr_index] = val;
 
-	pcr_ops->write(cpuc->pcr);
+	pcr_ops->write_pcr(pcr_index, cpuc->pcr[pcr_index]);
 }
 
 static inline void sparc_pmu_disable_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc, int idx)
 {
 	u64 mask = mask_for_index(idx);
 	u64 nop = nop_for_index(idx);
+	int pcr_index = 0;
 	u64 val;
 
-	val = cpuc->pcr;
+	if (sparc_pmu->num_pcrs > 1)
+		pcr_index = idx;
+
+	val = cpuc->pcr[pcr_index];
 	val &= ~mask;
 	val |= nop;
-	cpuc->pcr = val;
+	cpuc->pcr[pcr_index] = val;
 
-	pcr_ops->write(cpuc->pcr);
-}
-
-static u32 read_pmc(int idx)
-{
-	u64 val;
-
-	read_pic(val);
-	if (idx == PIC_UPPER_INDEX)
-		val >>= 32;
-
-	return val & 0xffffffff;
-}
-
-static void write_pmc(int idx, u64 val)
-{
-	u64 shift, mask, pic;
-
-	shift = 0;
-	if (idx == PIC_UPPER_INDEX)
-		shift = 32;
-
-	mask = ((u64) 0xffffffff) << shift;
-	val <<= shift;
-
-	read_pic(pic);
-	pic &= ~mask;
-	pic |= val;
-	write_pic(pic);
+	pcr_ops->write_pcr(pcr_index, cpuc->pcr[pcr_index]);
 }
 
 static u64 sparc_perf_event_update(struct perf_event *event,
@@ -618,7 +858,7 @@ static u64 sparc_perf_event_update(struct perf_event *event,
 
 again:
 	prev_raw_count = local64_read(&hwc->prev_count);
-	new_raw_count = read_pmc(idx);
+	new_raw_count = sparc_pmu->read_pmc(idx);
 
 	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
 			     new_raw_count) != prev_raw_count)
@@ -658,25 +898,17 @@ static int sparc_perf_event_set_period(struct perf_event *event,
 
 	local64_set(&hwc->prev_count, (u64)-left);
 
-	write_pmc(idx, (u64)(-left) & 0xffffffff);
+	sparc_pmu->write_pmc(idx, (u64)(-left) & 0xffffffff);
 
 	perf_event_update_userpage(event);
 
 	return ret;
 }
 
-/* If performance event entries have been added, move existing
- * events around (if necessary) and then assign new entries to
- * counters.
- */
-static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
+static void read_in_all_counters(struct cpu_hw_events *cpuc)
 {
 	int i;
 
-	if (!cpuc->n_added)
-		goto out;
-
-	/* Read in the counters which are moving.  */
 	for (i = 0; i < cpuc->n_events; i++) {
 		struct perf_event *cp = cpuc->event[i];
 
@@ -687,6 +919,20 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
 			cpuc->current_idx[i] = PIC_NO_INDEX;
 		}
 	}
+}
+
+/* On this PMU all PICs are programmed using a single PCR.  Calculate
+ * the combined control register value.
+ *
+ * For such chips we require that all of the events have the same
+ * configuration, so just fetch the settings from the first entry.
+ */
+static void calculate_single_pcr(struct cpu_hw_events *cpuc)
+{
+	int i;
+
+	if (!cpuc->n_added)
+		goto out;
 
 	/* Assign to counters all unassigned events.  */
 	for (i = 0; i < cpuc->n_events; i++) {
@@ -702,20 +948,71 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
 		cpuc->current_idx[i] = idx;
 
 		enc = perf_event_get_enc(cpuc->events[i]);
-		pcr &= ~mask_for_index(idx);
+		cpuc->pcr[0] &= ~mask_for_index(idx);
 		if (hwc->state & PERF_HES_STOPPED)
-			pcr |= nop_for_index(idx);
+			cpuc->pcr[0] |= nop_for_index(idx);
 		else
-			pcr |= event_encoding(enc, idx);
+			cpuc->pcr[0] |= event_encoding(enc, idx);
 	}
 out:
-	return pcr;
+	cpuc->pcr[0] |= cpuc->event[0]->hw.config_base;
+}
+
+/* On this PMU each PIC has its own PCR control register.  */
+static void calculate_multiple_pcrs(struct cpu_hw_events *cpuc)
+{
+	int i;
+
+	if (!cpuc->n_added)
+		goto out;
+
+	for (i = 0; i < cpuc->n_events; i++) {
+		struct perf_event *cp = cpuc->event[i];
+		struct hw_perf_event *hwc = &cp->hw;
+		int idx = hwc->idx;
+		u64 enc;
+
+		if (cpuc->current_idx[i] != PIC_NO_INDEX)
+			continue;
+
+		sparc_perf_event_set_period(cp, hwc, idx);
+		cpuc->current_idx[i] = idx;
+
+		enc = perf_event_get_enc(cpuc->events[i]);
+		cpuc->pcr[idx] &= ~mask_for_index(idx);
+		if (hwc->state & PERF_HES_STOPPED)
+			cpuc->pcr[idx] |= nop_for_index(idx);
+		else
+			cpuc->pcr[idx] |= event_encoding(enc, idx);
+	}
+out:
+	for (i = 0; i < cpuc->n_events; i++) {
+		struct perf_event *cp = cpuc->event[i];
+		int idx = cp->hw.idx;
+
+		cpuc->pcr[idx] |= cp->hw.config_base;
+	}
+}
+
+/* If performance event entries have been added, move existing events
+ * around (if necessary) and then assign new entries to counters.
+ */
+static void update_pcrs_for_enable(struct cpu_hw_events *cpuc)
+{
+	if (cpuc->n_added)
+		read_in_all_counters(cpuc);
+
+	if (sparc_pmu->num_pcrs == 1)
+		calculate_single_pcr(cpuc);
+	else
+		calculate_multiple_pcrs(cpuc);
 }
 
 static void sparc_pmu_enable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	u64 pcr;
+	int i;
 
 	if (cpuc->enabled)
 		return;
@@ -723,26 +1020,17 @@ static void sparc_pmu_enable(struct pmu *pmu)
 	cpuc->enabled = 1;
 	barrier();
 
-	pcr = cpuc->pcr;
-	if (!cpuc->n_events) {
-		pcr = 0;
-	} else {
-		pcr = maybe_change_configuration(cpuc, pcr);
-
-		/* We require that all of the events have the same
-		 * configuration, so just fetch the settings from the
-		 * first entry.
-		 */
-		cpuc->pcr = pcr | cpuc->event[0]->hw.config_base;
-	}
+	if (cpuc->n_events)
+		update_pcrs_for_enable(cpuc);
 
-	pcr_ops->write(cpuc->pcr);
+	for (i = 0; i < sparc_pmu->num_pcrs; i++)
+		pcr_ops->write_pcr(i, cpuc->pcr[i]);
 }
 
 static void sparc_pmu_disable(struct pmu *pmu)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-	u64 val;
+	int i;
 
 	if (!cpuc->enabled)
 		return;
@@ -750,12 +1038,14 @@ static void sparc_pmu_disable(struct pmu *pmu)
 	cpuc->enabled = 0;
 	cpuc->n_added = 0;
 
-	val = cpuc->pcr;
-	val &= ~(PCR_UTRACE | PCR_STRACE |
-		 sparc_pmu->hv_bit | sparc_pmu->irq_bit);
-	cpuc->pcr = val;
+	for (i = 0; i < sparc_pmu->num_pcrs; i++) {
+		u64 val = cpuc->pcr[i];
 
-	pcr_ops->write(cpuc->pcr);
+		val &= ~(sparc_pmu->user_bit | sparc_pmu->priv_bit |
+			 sparc_pmu->hv_bit | sparc_pmu->irq_bit);
+		cpuc->pcr[i] = val;
+		pcr_ops->write_pcr(i, cpuc->pcr[i]);
+	}
 }
 
 static int active_event_index(struct cpu_hw_events *cpuc,
@@ -854,9 +1144,11 @@ static DEFINE_MUTEX(pmc_grab_mutex);
 static void perf_stop_nmi_watchdog(void *unused)
 {
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+	int i;
 
 	stop_nmi_watchdog(NULL);
-	cpuc->pcr = pcr_ops->read();
+	for (i = 0; i < sparc_pmu->num_pcrs; i++)
+		cpuc->pcr[i] = pcr_ops->read_pcr(i);
 }
 
 void perf_event_grab_pmc(void)
@@ -942,9 +1234,17 @@ static int sparc_check_constraints(struct perf_event **evts,
 	if (!n_ev)
 		return 0;
 
-	if (n_ev > MAX_HWEVENTS)
+	if (n_ev > sparc_pmu->max_hw_events)
 		return -1;
 
+	if (!(sparc_pmu->flags & SPARC_PMU_HAS_CONFLICTS)) {
+		int i;
+
+		for (i = 0; i < n_ev; i++)
+			evts[i]->hw.idx = i;
+		return 0;
+	}
+
 	msk0 = perf_event_get_msk(events[0]);
 	if (n_ev == 1) {
 		if (msk0 & PIC_LOWER)
@@ -1000,6 +1300,9 @@ static int check_excludes(struct perf_event **evts, int n_prev, int n_new)
 	struct perf_event *event;
 	int i, n, first;
 
+	if (!(sparc_pmu->flags & SPARC_PMU_ALL_EXCLUDES_SAME))
+		return 0;
+
 	n = n_prev + n_new;
 	if (n <= 1)
 		return 0;
@@ -1059,7 +1362,7 @@ static int sparc_pmu_add(struct perf_event *event, int ef_flags)
 	perf_pmu_disable(event->pmu);
 
 	n0 = cpuc->n_events;
-	if (n0 >= MAX_HWEVENTS)
+	if (n0 >= sparc_pmu->max_hw_events)
 		goto out;
 
 	cpuc->event[n0] = event;
@@ -1146,16 +1449,16 @@ static int sparc_pmu_event_init(struct perf_event *event)
 	/* We save the enable bits in the config_base.  */
 	hwc->config_base = sparc_pmu->irq_bit;
 	if (!attr->exclude_user)
-		hwc->config_base |= PCR_UTRACE;
+		hwc->config_base |= sparc_pmu->user_bit;
 	if (!attr->exclude_kernel)
-		hwc->config_base |= PCR_STRACE;
+		hwc->config_base |= sparc_pmu->priv_bit;
 	if (!attr->exclude_hv)
 		hwc->config_base |= sparc_pmu->hv_bit;
 
 	n = 0;
 	if (event->group_leader != event) {
 		n = collect_events(event->group_leader,
-				   MAX_HWEVENTS - 1,
+				   sparc_pmu->max_hw_events - 1,
 				   evts, events, current_idx_dmy);
 		if (n < 0)
 			return -EINVAL;
@@ -1254,8 +1557,7 @@ static struct pmu pmu = {
 void perf_event_print_debug(void)
 {
 	unsigned long flags;
-	u64 pcr, pic;
-	int cpu;
+	int cpu, i;
 
 	if (!sparc_pmu)
 		return;
@@ -1264,12 +1566,13 @@ void perf_event_print_debug(void)
 
 	cpu = smp_processor_id();
 
-	pcr = pcr_ops->read();
-	read_pic(pic);
-
 	pr_info("\n");
-	pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n",
-		cpu, pcr, pic);
+	for (i = 0; i < sparc_pmu->num_pcrs; i++)
+		pr_info("CPU#%d: PCR%d[%016llx]\n",
+			cpu, i, pcr_ops->read_pcr(i));
+	for (i = 0; i < sparc_pmu->num_pic_regs; i++)
+		pr_info("CPU#%d: PIC%d[%016llx]\n",
+			cpu, i, pcr_ops->read_pic(i));
 
 	local_irq_restore(flags);
 }
@@ -1305,8 +1608,9 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 	 * Do this before we peek at the counters to determine
 	 * overflow so we don't lose any events.
 	 */
-	if (sparc_pmu->irq_bit)
-		pcr_ops->write(cpuc->pcr);
+	if (sparc_pmu->irq_bit &&
+	    sparc_pmu->num_pcrs == 1)
+		pcr_ops->write_pcr(0, cpuc->pcr[0]);
 
 	for (i = 0; i < cpuc->n_events; i++) {
 		struct perf_event *event = cpuc->event[i];
@@ -1314,6 +1618,10 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self,
 		struct hw_perf_event *hwc;
 		u64 val;
 
+		if (sparc_pmu->irq_bit &&
+		    sparc_pmu->num_pcrs > 1)
+			pcr_ops->write_pcr(idx, cpuc->pcr[idx]);
+
 		hwc = &event->hw;
 		val = sparc_perf_event_update(event, hwc, idx);
 		if (val & (1ULL << 31))
@@ -1352,6 +1660,10 @@ static bool __init supported_pmu(void)
 		sparc_pmu = &niagara2_pmu;
 		return true;
 	}
+	if (!strcmp(sparc_pmu_type, "niagara4")) {
+		sparc_pmu = &niagara4_pmu;
+		return true;
+	}
 	return false;
 }
 
diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c
index 1414d16712b2..0800e71d8a88 100644
--- a/arch/sparc/kernel/setup_64.c
+++ b/arch/sparc/kernel/setup_64.c
@@ -340,7 +340,12 @@ static const char *hwcaps[] = {
 	 */
 	"mul32", "div32", "fsmuld", "v8plus", "popc", "vis", "vis2",
 	"ASIBlkInit", "fmaf", "vis3", "hpc", "random", "trans", "fjfmau",
-	"ima", "cspare",
+	"ima", "cspare", "pause", "cbcond",
+};
+
+static const char *crypto_hwcaps[] = {
+	"aes", "des", "kasumi", "camellia", "md5", "sha1", "sha256",
+	"sha512", "mpmul", "montmul", "montsqr", "crc32c",
 };
 
 void cpucap_info(struct seq_file *m)
@@ -357,27 +362,61 @@ void cpucap_info(struct seq_file *m)
 			printed++;
 		}
 	}
+	if (caps & HWCAP_SPARC_CRYPTO) {
+		unsigned long cfr;
+
+		__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+		for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) {
+			unsigned long bit = 1UL << i;
+			if (cfr & bit) {
+				seq_printf(m, "%s%s",
+					   printed ? "," : "", crypto_hwcaps[i]);
+				printed++;
+			}
+		}
+	}
 	seq_putc(m, '\n');
 }
 
+static void __init report_one_hwcap(int *printed, const char *name)
+{
+	if ((*printed) == 0)
+		printk(KERN_INFO "CPU CAPS: [");
+	printk(KERN_CONT "%s%s",
+	       (*printed) ? "," : "", name);
+	if (++(*printed) == 8) {
+		printk(KERN_CONT "]\n");
+		*printed = 0;
+	}
+}
+
+static void __init report_crypto_hwcaps(int *printed)
+{
+	unsigned long cfr;
+	int i;
+
+	__asm__ __volatile__("rd %%asr26, %0" : "=r" (cfr));
+
+	for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) {
+		unsigned long bit = 1UL << i;
+		if (cfr & bit)
+			report_one_hwcap(printed, crypto_hwcaps[i]);
+	}
+}
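Both loops above treat bit i of the CFR as crypto_hwcaps[i]; a sketch of a
helper that decodes a saved CFR value into the same comma-separated form
(illustrative only, relying on the crypto_hwcaps[] array above; not part of
the patch):

	static void cfr_to_string(unsigned long cfr, char *buf, size_t len)
	{
		int i, printed = 0;
		size_t off = 0;

		buf[0] = '\0';
		for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) {
			if (!(cfr & (1UL << i)))
				continue;
			off += scnprintf(buf + off, len - off, "%s%s",
					 printed++ ? "," : "",
					 crypto_hwcaps[i]);
		}
	}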
+
 static void __init report_hwcaps(unsigned long caps)
 {
 	int i, printed = 0;
 
-	printk(KERN_INFO "CPU CAPS: [");
 	for (i = 0; i < ARRAY_SIZE(hwcaps); i++) {
 		unsigned long bit = 1UL << i;
-		if (caps & bit) {
-			printk(KERN_CONT "%s%s",
-			       printed ? "," : "", hwcaps[i]);
-			if (++printed == 8) {
-				printk(KERN_CONT "]\n");
-				printk(KERN_INFO "CPU CAPS: [");
-				printed = 0;
-			}
-		}
+		if (caps & bit)
+			report_one_hwcap(&printed, hwcaps[i]);
 	}
-	printk(KERN_CONT "]\n");
+	if (caps & HWCAP_SPARC_CRYPTO)
+		report_crypto_hwcaps(&printed);
+	if (printed != 0)
+		printk(KERN_CONT "]\n");
 }
 
 static unsigned long __init mdesc_cpu_hwcap_list(void)
@@ -411,6 +450,10 @@ static unsigned long __init mdesc_cpu_hwcap_list(void)
 				break;
 			}
 		}
+		for (i = 0; i < ARRAY_SIZE(crypto_hwcaps); i++) {
+			if (!strcmp(prop, crypto_hwcaps[i]))
+				caps |= HWCAP_SPARC_CRYPTO;
+		}
 
 		plen = strlen(prop) + 1;
 		prop += plen;
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index dff4096f3dec..30f6ab51c551 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -32,6 +32,9 @@ lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o
 lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o
 lib-$(CONFIG_SPARC64) +=  NG2patch.o
 
+lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o
+lib-$(CONFIG_SPARC64) +=  NG4patch.o NG4copy_page.o
+
 lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
 lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
 
diff --git a/arch/sparc/lib/NG4copy_from_user.S b/arch/sparc/lib/NG4copy_from_user.S
new file mode 100644
index 000000000000..fd9f903ffa32
--- /dev/null
+++ b/arch/sparc/lib/NG4copy_from_user.S
@@ -0,0 +1,30 @@
+/* NG4copy_from_user.S: Niagara-4 optimized copy from userspace.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x)		\
+98:	x;			\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, __retl_one_asi;\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#define FUNC_NAME		NG4copy_from_user
+#define LOAD(type,addr,dest)	type##a [addr] %asi, dest
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, ___copy_in_user;		\
+	 nop
+#endif
+
+#include "NG4memcpy.S"
diff --git a/arch/sparc/lib/NG4copy_page.S b/arch/sparc/lib/NG4copy_page.S
new file mode 100644
index 000000000000..f30ec10bbcac
--- /dev/null
+++ b/arch/sparc/lib/NG4copy_page.S
@@ -0,0 +1,57 @@
+/* NG4copy_page.S: Niagara-4 optimized copy page.
+ *
+ * Copyright (C) 2012 (davem@davemloft.net)
+ */
+
+#include <asm/asi.h>
+#include <asm/page.h>
+
+	.text
+	.align		32
+
+	.register	%g2, #scratch
+	.register	%g3, #scratch
+
+	.globl		NG4copy_user_page
+NG4copy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
+	prefetch	[%o1 + 0x000], #n_reads_strong
+	prefetch	[%o1 + 0x040], #n_reads_strong
+	prefetch	[%o1 + 0x080], #n_reads_strong
+	prefetch	[%o1 + 0x0c0], #n_reads_strong
+	set		PAGE_SIZE, %g7
+	prefetch	[%o1 + 0x100], #n_reads_strong
+	prefetch	[%o1 + 0x140], #n_reads_strong
+	prefetch	[%o1 + 0x180], #n_reads_strong
+	prefetch	[%o1 + 0x1c0], #n_reads_strong
+1:
+	ldx		[%o1 + 0x00], %o2
+	subcc		%g7, 0x40, %g7
+	ldx		[%o1 + 0x08], %o3
+	ldx		[%o1 + 0x10], %o4
+	ldx		[%o1 + 0x18], %o5
+	ldx		[%o1 + 0x20], %g1
+	stxa		%o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	ldx		[%o1 + 0x28], %g2
+	stxa		%o3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	ldx		[%o1 + 0x30], %g3
+	stxa		%o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	ldx		[%o1 + 0x38], %o2
+	add		%o1, 0x40, %o1
+	stxa		%o5, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	stxa		%g1, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	stxa		%g2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	stxa		%g3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	stxa		%o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 0x08, %o0
+	bne,pt		%icc, 1b
+	 prefetch	[%o1 + 0x200], #n_reads_strong
+	retl
+	 membar		#StoreLoad | #StoreStore
+	.size		NG4copy_user_page,.-NG4copy_user_page
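The loop above retires one 64-byte chunk per iteration as eight ldx/stxa
pairs, with the block-init ASI letting destination lines be allocated
without first being read.  A C model of the loop structure (plain stores
stand in for the ASI_BLK_INIT_QUAD_LDD_P stores; sketch, not kernel code):

	#include <stdint.h>

	#define PAGE_SIZE	8192UL	/* sparc64 base page size */

	static void copy_page_model(uint64_t *dst, const uint64_t *src)
	{
		unsigned long remaining = PAGE_SIZE;

		while (remaining) {
			int i;

			for (i = 0; i < 8; i++)	/* 8 x 8 = 64 bytes */
				*dst++ = *src++;
			remaining -= 64;
		}
	}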
diff --git a/arch/sparc/lib/NG4copy_to_user.S b/arch/sparc/lib/NG4copy_to_user.S
new file mode 100644
index 000000000000..9744c4540a8d
--- /dev/null
+++ b/arch/sparc/lib/NG4copy_to_user.S
@@ -0,0 +1,39 @@
+/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x)		\
+98:	x;			\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, __retl_one_asi;\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
+#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
+#endif
+
+#define FUNC_NAME		NG4copy_to_user
+#define STORE(type,src,addr)	type##a src, [addr] %asi
+#define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_AIUS
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+	/* Writing to %asi is _expensive_ so we hardcode it.
+	 * Reading %asi to check for KERNEL_DS is comparatively
+	 * cheap.
+	 */
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, ___copy_in_user;		\
+	 nop
+#endif
+
+#include "NG4memcpy.S"
diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S
new file mode 100644
index 000000000000..9cf2ee01cee3
--- /dev/null
+++ b/arch/sparc/lib/NG4memcpy.S
@@ -0,0 +1,360 @@
+/* NG4memcpy.S: Niagara-4 optimized memcpy.
+ *
+ * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
+ */
+
+#ifdef __KERNEL__
+#include <asm/visasm.h>
+#include <asm/asi.h>
+#define GLOBAL_SPARE	%g7
+#else
+#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
+#define FPRS_FEF  0x04
+
+/* On T4 it is very expensive to access ASRs like %fprs and
+ * %asi; avoiding a read or a write can save ~50 cycles.
+ */
+#define FPU_ENTER			\
+	rd	%fprs, %o5;		\
+	andcc	%o5, FPRS_FEF, %g0;	\
+	be,a,pn	%icc, 999f;		\
+	 wr	%g0, FPRS_FEF, %fprs;	\
+	999:
+
+#ifdef MEMCPY_DEBUG
+#define VISEntryHalf FPU_ENTER; \
+		     clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#else
+#define VISEntryHalf FPU_ENTER
+#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
+#endif
+
+#define GLOBAL_SPARE	%g5
+#endif
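A C analogue of the check-before-write pattern FPU_ENTER implements, given
how costly the ASR access is.  The accessors here are hypothetical stand-ins
for the rd/wr instructions (sketch only):

	/* stand-ins for "rd %fprs" / "wr %fprs" on real hardware */
	static unsigned long fake_fprs;
	static unsigned long rd_fprs(void) { return fake_fprs; }
	static void wr_fprs(unsigned long v) { fake_fprs = v; }

	#define FPRS_FEF	0x04

	static unsigned long fpu_enter(void)
	{
		unsigned long fprs = rd_fprs();

		if (!(fprs & FPRS_FEF))	/* only pay for the costly write */
			wr_fprs(FPRS_FEF);	/* when FEF is not yet set */
		return fprs;	/* VISExitHalf restores this value */
	}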
+
+#ifndef STORE_ASI
+#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
+#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
+#else
+#define STORE_ASI	0x80		/* ASI_P */
+#endif
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x)	x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x)	x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+#endif
+
+#ifndef LOAD
+#define LOAD(type,addr,dest)	type [addr], dest
+#endif
+
+#ifndef STORE
+#ifndef MEMCPY_DEBUG
+#define STORE(type,src,addr)	type src, [addr]
+#else
+#define STORE(type,src,addr)	type##a src, [addr] %asi
+#endif
+#endif
+
+#ifndef STORE_INIT
+#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME	NG4memcpy
+#endif
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+	.register	%g2,#scratch
+	.register	%g3,#scratch
+
+	.text
+	.align		64
+
+	.globl	FUNC_NAME
+	.type	FUNC_NAME,#function
+FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
+#ifdef MEMCPY_DEBUG
+	wr		%g0, 0x80, %asi
+#endif
+	srlx		%o2, 31, %g2
+	cmp		%g2, 0
+	tne		%XCC, 5
+	PREAMBLE
+	mov		%o0, %o3
+	brz,pn		%o2, .Lexit
+	 cmp		%o2, 3
+	ble,pn		%icc, .Ltiny
+	 cmp		%o2, 19
+	ble,pn		%icc, .Lsmall
+	 or		%o0, %o1, %g2
+	cmp		%o2, 128
+	bl,pn		%icc, .Lmedium
+	 nop
+
+.Llarge:/* len >= 0x80 */
+	/* First get dest 8 byte aligned.  */
+	sub		%g0, %o0, %g1
+	and		%g1, 0x7, %g1
+	brz,pt		%g1, 51f
+	 sub		%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+	add		%o1, 1, %o1
+	subcc		%g1, 1, %g1
+	add		%o0, 1, %o0
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
+
+51:	LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
+	LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
+
+	/* Check if we can use the straight fully aligned
+	 * loop, or we require the alignaddr/faligndata variant.
+	 */
+	andcc		%o1, 0x7, %o5
+	bne,pn		%icc, .Llarge_src_unaligned
+	 sub		%g0, %o0, %g1
+
+	/* Legitimize the use of initializing stores by getting dest
+	 * to be 64-byte aligned.
+	 */
+	and		%g1, 0x3f, %g1
+	brz,pt		%g1, .Llarge_aligned
+	 sub		%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
+	add		%o1, 8, %o1
+	subcc		%g1, 8, %g1
+	add		%o0, 8, %o0
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stx, %g2, %o0 - 0x08))
+
+.Llarge_aligned:
+	/* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
+	andn		%o2, 0x3f, %o4
+	sub		%o2, %o4, %o2
+
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	add		%o1, 0x40, %o1
+	EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
+	subcc		%o4, 0x40, %o4
+	EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
+	EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
+	EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
+	EX_ST(STORE_INIT(%g1, %o0))
+	add		%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g2, %o0))
+	add		%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
+	EX_ST(STORE_INIT(%g3, %o0))
+	add		%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
+	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+	add		%o0, 0x08, %o0
+	EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
+	EX_ST(STORE_INIT(%o5, %o0))
+	add		%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g2, %o0))
+	add		%o0, 0x08, %o0
+	EX_ST(STORE_INIT(%g3, %o0))
+	add		%o0, 0x08, %o0
+	EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
+	add		%o0, 0x08, %o0
+	bne,pt		%icc, 1b
+	 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
+
+	membar		#StoreLoad | #StoreStore
+
+	brz,pn		%o2, .Lexit
+	 cmp		%o2, 19
+	ble,pn		%icc, .Lsmall_unaligned
+	 nop
+	ba,a,pt		%icc, .Lmedium_noprefetch
+
+.Lexit:	retl
+	 mov		EX_RETVAL(%o3), %o0
+
+.Llarge_src_unaligned:
+	andn		%o2, 0x3f, %o4
+	sub		%o2, %o4, %o2
+	VISEntryHalf
+	alignaddr	%o1, %g0, %g1
+	add		%o1, %o4, %o1
+	EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
+1:	EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
+	subcc		%o4, 0x40, %o4
+	EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
+	EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
+	EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
+	EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
+	EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
+	EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
+	faligndata	%f0, %f2, %f16
+	EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
+	faligndata	%f2, %f4, %f18
+	add		%g1, 0x40, %g1
+	faligndata	%f4, %f6, %f20
+	faligndata	%f6, %f8, %f22
+	faligndata	%f8, %f10, %f24
+	faligndata	%f10, %f12, %f26
+	faligndata	%f12, %f14, %f28
+	faligndata	%f14, %f0, %f30
+	EX_ST(STORE(std, %f16, %o0 + 0x00))
+	EX_ST(STORE(std, %f18, %o0 + 0x08))
+	EX_ST(STORE(std, %f20, %o0 + 0x10))
+	EX_ST(STORE(std, %f22, %o0 + 0x18))
+	EX_ST(STORE(std, %f24, %o0 + 0x20))
+	EX_ST(STORE(std, %f26, %o0 + 0x28))
+	EX_ST(STORE(std, %f28, %o0 + 0x30))
+	EX_ST(STORE(std, %f30, %o0 + 0x38))
+	add		%o0, 0x40, %o0
+	bne,pt		%icc, 1b
+	 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
+	VISExitHalf
+
+	brz,pn		%o2, .Lexit
+	 cmp		%o2, 19
+	ble,pn		%icc, .Lsmall_unaligned
+	 nop
+	ba,a,pt		%icc, .Lmedium_unaligned
+
+.Lmedium:
+	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
+	andcc		%g2, 0x7, %g0
+	bne,pn		%icc, .Lmedium_unaligned
+	 nop
+.Lmedium_noprefetch:
+	andncc		%o2, 0x20 - 1, %o5
+	be,pn		%icc, 2f
+	 sub		%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
+	EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
+	EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
+	add		%o1, 0x20, %o1
+	subcc		%o5, 0x20, %o5
+	EX_ST(STORE(stx, %g1, %o0 + 0x00))
+	EX_ST(STORE(stx, %g2, %o0 + 0x08))
+	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
+	EX_ST(STORE(stx, %o4, %o0 + 0x18))
+	bne,pt		%icc, 1b
+	 add		%o0, 0x20, %o0
+2:	andcc		%o2, 0x18, %o5
+	be,pt		%icc, 3f
+	 sub		%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
+	add		%o1, 0x08, %o1
+	add		%o0, 0x08, %o0
+	subcc		%o5, 0x08, %o5
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stx, %g1, %o0 - 0x08))
+3:	brz,pt		%o2, .Lexit
+	 cmp		%o2, 0x04
+	bl,pn		%icc, .Ltiny
+	 nop
+	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+	add		%o1, 0x04, %o1
+	add		%o0, 0x04, %o0
+	subcc		%o2, 0x04, %o2
+	bne,pn		%icc, .Ltiny
+	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
+	ba,a,pt		%icc, .Lexit
+.Lmedium_unaligned:
+	/* First get dest 8 byte aligned.  */
+	sub		%g0, %o0, %g1
+	and		%g1, 0x7, %g1
+	brz,pt		%g1, 2f
+	 sub		%o2, %g1, %o2
+
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
+	add		%o1, 1, %o1
+	subcc		%g1, 1, %g1
+	add		%o0, 1, %o0
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stb, %g2, %o0 - 0x01))
+2:
+	and		%o1, 0x7, %g1
+	brz,pn		%g1, .Lmedium_noprefetch
+	 sll		%g1, 3, %g1
+	mov		64, %g2
+	sub		%g2, %g1, %g2
+	andn		%o1, 0x7, %o1
+	EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
+	sllx		%o4, %g1, %o4
+	andn		%o2, 0x08 - 1, %o5
+	sub		%o2, %o5, %o2
+1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
+	add		%o1, 0x08, %o1
+	subcc		%o5, 0x08, %o5
+	srlx		%g3, %g2, GLOBAL_SPARE
+	or		GLOBAL_SPARE, %o4, GLOBAL_SPARE
+	EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
+	add		%o0, 0x08, %o0
+	bne,pt		%icc, 1b
+	 sllx		%g3, %g1, %o4
+	srl		%g1, 3, %g1
+	add		%o1, %g1, %o1
+	brz,pn		%o2, .Lexit
+	 nop
+	ba,pt		%icc, .Lsmall_unaligned
+
+.Ltiny:
+	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+	subcc		%o2, 1, %o2
+	be,pn		%icc, .Lexit
+	 EX_ST(STORE(stb, %g1, %o0 + 0x00))
+	EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
+	subcc		%o2, 1, %o2
+	be,pn		%icc, .Lexit
+	 EX_ST(STORE(stb, %g1, %o0 + 0x01))
+	EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
+	ba,pt		%icc, .Lexit
+	 EX_ST(STORE(stb, %g1, %o0 + 0x02))
+
+.Lsmall:
+	andcc		%g2, 0x3, %g0
+	bne,pn		%icc, .Lsmall_unaligned
+	 andn		%o2, 0x4 - 1, %o5
+	sub		%o2, %o5, %o2
+1:
+	EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
+	add		%o1, 0x04, %o1
+	subcc		%o5, 0x04, %o5
+	add		%o0, 0x04, %o0
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stw, %g1, %o0 - 0x04))
+	brz,pt		%o2, .Lexit
+	 nop
+	ba,a,pt		%icc, .Ltiny
+
+.Lsmall_unaligned:
+1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
+	add		%o1, 1, %o1
+	add		%o0, 1, %o0
+	subcc		%o2, 1, %o2
+	bne,pt		%icc, 1b
+	 EX_ST(STORE(stb, %g1, %o0 - 0x01))
+	ba,a,pt		%icc, .Lexit
+	.size		FUNC_NAME, .-FUNC_NAME
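The entry sequence of FUNC_NAME above dispatches on length; restated in C
for readability (sketch, not kernel code):

	/* len == 0  -> exit
	 * len <= 3  -> .Ltiny   (byte copies)
	 * len <= 19 -> .Lsmall  (word copies when src|dst is 4-aligned)
	 * len < 128 -> .Lmedium (8-byte copies with alignment fixups)
	 * else      -> .Llarge  (64-byte chunks, init-store/FP paths)
	 */
	enum ng4_path { NG4_EXIT, NG4_TINY, NG4_SMALL, NG4_MEDIUM, NG4_LARGE };

	static enum ng4_path ng4_classify(unsigned long len)
	{
		if (!len)
			return NG4_EXIT;
		if (len <= 3)
			return NG4_TINY;
		if (len <= 19)
			return NG4_SMALL;
		if (len < 128)
			return NG4_MEDIUM;
		return NG4_LARGE;
	}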
diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S
new file mode 100644
index 000000000000..c21c34c61dda
--- /dev/null
+++ b/arch/sparc/lib/NG4patch.S
@@ -0,0 +1,43 @@
+/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant.
+ *
+ * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
+ */
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	niagara4_patch_copyops
+	.type	niagara4_patch_copyops,#function
+niagara4_patch_copyops:
+	NG_DO_PATCH(memcpy, NG4memcpy)
+	NG_DO_PATCH(___copy_from_user, NG4copy_from_user)
+	NG_DO_PATCH(___copy_to_user, NG4copy_to_user)
+	retl
+	 nop
+	.size	niagara4_patch_copyops,.-niagara4_patch_copyops
+
+	.globl	niagara4_patch_pageops
+	.type	niagara4_patch_pageops,#function
+niagara4_patch_pageops:
+	NG_DO_PATCH(copy_user_page, NG4copy_user_page)
+	NG_DO_PATCH(_clear_page, NGclear_page)
+	NG_DO_PATCH(clear_user_page, NGclear_user_page)
+	retl
+	 nop
+	.size	niagara4_patch_pageops,.-niagara4_patch_pageops
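NG_DO_PATCH rewrites the first two instructions of OLD with a branch-always
to NEW plus a nop; the sll 11/srl 13 pair truncates the word displacement to
the 19-bit field of the 0x10680000 template (which appears to encode
"ba,pt %xcc").  The same arithmetic in C, as a hedged sketch:

	#include <stdint.h>

	#define BRANCH_ALWAYS	0x10680000u
	#define NOP		0x01000000u

	static void ng_do_patch(uint32_t *old, uint32_t *new_tgt)
	{
		long delta = (char *)new_tgt - (char *)old;
		uint32_t disp19 = ((uint32_t)(delta >> 2)) & 0x7ffff;

		old[0] = BRANCH_ALWAYS | disp19;	/* ba,pt to NEW */
		old[1] = NOP;
		/* the real macro then issues "flush" to sync the I-cache */
	}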
diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S
index b9e790b9c6b8..423d46e2258b 100644
--- a/arch/sparc/lib/NGpage.S
+++ b/arch/sparc/lib/NGpage.S
@@ -59,6 +59,8 @@ NGcopy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
 	 restore
 
 	.align		32
+	.globl		NGclear_page
+	.globl		NGclear_user_page
 NGclear_page:		/* %o0=dest */
 NGclear_user_page:	/* %o0=dest, %o1=vaddr */
 	rd		%asi, %g3
diff --git a/arch/sparc/lib/ksyms.c b/arch/sparc/lib/ksyms.c
index 3b31218cafc6..ee31b884c61b 100644
--- a/arch/sparc/lib/ksyms.c
+++ b/arch/sparc/lib/ksyms.c
@@ -134,6 +134,10 @@ EXPORT_SYMBOL(copy_user_page);
 void VISenter(void);
 EXPORT_SYMBOL(VISenter);
 
+/* CRYPTO code needs this */
+void VISenterhalf(void);
+EXPORT_SYMBOL(VISenterhalf);
+
 extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
 extern void xor_vis_3(unsigned long, unsigned long *, unsigned long *,
 		unsigned long *);
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index d58edf5fefdb..696bb095e0fc 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -51,22 +51,40 @@
 
 #include "init_64.h"
 
-unsigned long kern_linear_pte_xor[2] __read_mostly;
+unsigned long kern_linear_pte_xor[4] __read_mostly;
 
-/* A bitmap, one bit for every 256MB of physical memory.  If the bit
- * is clear, we should use a 4MB page (via kern_linear_pte_xor[0]) else
- * if set we should use a 256MB page (via kern_linear_pte_xor[1]).
+/* A bitmap, two bits for every 256MB of physical memory.  These two
+ * bits determine what page size we use for kernel linear
+ * translations.  They form an index into kern_linear_pte_xor[].  The
+ * value in the indexed slot is XOR'd with the TLB miss virtual
+ * address to form the resulting TTE.  The mapping is:
+ *
+ *	0	==>	4MB
+ *	1	==>	256MB
+ *	2	==>	2GB
+ *	3	==>	16GB
+ *
+ * All sun4v chips support 256MB pages.  Only SPARC-T4 and later
+ * support 2GB pages, and hopefully future cpus will support the 16GB
+ * pages as well.  If a larger size is unsupported, slots 2 and 3
+ * fall back to the next smaller supported size's xor value.
+ *
+ * It would be nice to determine this from the machine description
+ * 'cpu' properties, but we need to have this table setup before the
+ * MDESC is initialized.
  */
 unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
 
 #ifndef CONFIG_DEBUG_PAGEALLOC
-/* A special kernel TSB for 4MB and 256MB linear mappings.
- * Space is allocated for this right after the trap table
- * in arch/sparc64/kernel/head.S
+/* A special kernel TSB for 4MB, 256MB, 2GB and 16GB linear mappings.
+ * Space is allocated for this right after the trap table in
+ * arch/sparc64/kernel/head.S
  */
 extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
 #endif
 
+static unsigned long cpu_pgsz_mask;
+
 #define MAX_BANKS	32
 
 static struct linux_prom64_registers pavail[MAX_BANKS] __devinitdata;
@@ -403,6 +421,12 @@ EXPORT_SYMBOL(flush_icache_range);
 
 void mmu_info(struct seq_file *m)
 {
+	static const char *pgsz_strings[] = {
+		"8K", "64K", "512K", "4MB", "32MB",
+		"256MB", "2GB", "16GB",
+	};
+	int i, printed;
+
 	if (tlb_type == cheetah)
 		seq_printf(m, "MMU Type\t: Cheetah\n");
 	else if (tlb_type == cheetah_plus)
@@ -414,6 +438,17 @@ void mmu_info(struct seq_file *m)
 	else
 		seq_printf(m, "MMU Type\t: ???\n");
 
+	seq_printf(m, "MMU PGSZs\t: ");
+	printed = 0;
+	for (i = 0; i < ARRAY_SIZE(pgsz_strings); i++) {
+		if (cpu_pgsz_mask & (1UL << i)) {
+			seq_printf(m, "%s%s",
+				   printed ? "," : "", pgsz_strings[i]);
+			printed++;
+		}
+	}
+	seq_putc(m, '\n');
+
 #ifdef CONFIG_DEBUG_DCFLUSH
 	seq_printf(m, "DCPageFlushes\t: %d\n",
 		   atomic_read(&dcpage_flushes));
@@ -1358,32 +1393,75 @@ static unsigned long __ref kernel_map_range(unsigned long pstart,
 extern unsigned int kvmap_linear_patch[1];
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
-static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
+static void __init kpte_set_val(unsigned long index, unsigned long val)
 {
-	const unsigned long shift_256MB = 28;
-	const unsigned long mask_256MB = ((1UL << shift_256MB) - 1UL);
-	const unsigned long size_256MB = (1UL << shift_256MB);
+	unsigned long *ptr = kpte_linear_bitmap;
 
-	while (start < end) {
-		long remains;
+	val <<= ((index % (BITS_PER_LONG / 2)) * 2);
+	ptr += (index / (BITS_PER_LONG / 2));
 
-		remains = end - start;
-		if (remains < size_256MB)
-			break;
+	*ptr |= val;
+}
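For reference, the inverse lookup extracts the same two-bit selector; the
real consumer is the assembler TLB-miss path, so this C form is a sketch
only:

	static unsigned long kpte_get_val(unsigned long index)
	{
		unsigned long *ptr = kpte_linear_bitmap;

		ptr += (index / (BITS_PER_LONG / 2));
		return (*ptr >> ((index % (BITS_PER_LONG / 2)) * 2)) & 3UL;
	}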
 
-		if (start & mask_256MB) {
-			start = (start + size_256MB) & ~mask_256MB;
-			continue;
-		}
+static const unsigned long kpte_shift_min = 28; /* 256MB */
+static const unsigned long kpte_shift_max = 34; /* 16GB */
+static const unsigned long kpte_shift_incr = 3;
+
+static unsigned long kpte_mark_using_shift(unsigned long start, unsigned long end,
+					   unsigned long shift)
+{
+	unsigned long size = (1UL << shift);
+	unsigned long mask = (size - 1UL);
+	unsigned long remains = end - start;
+	unsigned long val;
+
+	if (remains < size || (start & mask))
+		return start;
+
+	/* VAL maps:
+	 *
+	 *	shift 28 --> kern_linear_pte_xor index 1
+	 *	shift 31 --> kern_linear_pte_xor index 2
+	 *	shift 34 --> kern_linear_pte_xor index 3
+	 */
+	val = ((shift - kpte_shift_min) / kpte_shift_incr) + 1;
+
+	remains &= ~mask;
+	if (shift != kpte_shift_max)
+		remains = size;
 
-		while (remains >= size_256MB) {
-			unsigned long index = start >> shift_256MB;
+	while (remains) {
+		unsigned long index = start >> kpte_shift_min;
 
-			__set_bit(index, kpte_linear_bitmap);
+		kpte_set_val(index, val);
 
-			start += size_256MB;
-			remains -= size_256MB;
+		start += 1UL << kpte_shift_min;
+		remains -= 1UL << kpte_shift_min;
+	}
+
+	return start;
+}
+
+static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
+{
+	unsigned long smallest_size, smallest_mask;
+	unsigned long s;
+
+	smallest_size = (1UL << kpte_shift_min);
+	smallest_mask = (smallest_size - 1UL);
+
+	while (start < end) {
+		unsigned long orig_start = start;
+
+		for (s = kpte_shift_max; s >= kpte_shift_min; s -= kpte_shift_incr) {
+			start = kpte_mark_using_shift(start, end, s);
+
+			if (start != orig_start)
+				break;
 		}
+
+		if (start == orig_start)
+			start = (start + smallest_size) & ~smallest_mask;
 	}
 }
 
@@ -1577,13 +1655,16 @@ static void __init sun4v_ktsb_init(void)
 	ktsb_descr[0].resv = 0;
 
 #ifndef CONFIG_DEBUG_PAGEALLOC
-	/* Second KTSB for 4MB/256MB mappings.  */
+	/* Second KTSB for 4MB/256MB/2GB/16GB mappings.  */
 	ktsb_pa = (kern_base +
 		   ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));
 
 	ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB;
-	ktsb_descr[1].pgsz_mask = (HV_PGSZ_MASK_4MB |
-				   HV_PGSZ_MASK_256MB);
+	ktsb_descr[1].pgsz_mask = ((HV_PGSZ_MASK_4MB |
+				    HV_PGSZ_MASK_256MB |
+				    HV_PGSZ_MASK_2GB |
+				    HV_PGSZ_MASK_16GB) &
+				   cpu_pgsz_mask);
 	ktsb_descr[1].assoc = 1;
 	ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES;
 	ktsb_descr[1].ctx_idx = 0;
@@ -1606,6 +1687,47 @@ void __cpuinit sun4v_ktsb_register(void)
 	}
 }
 
+static void __init sun4u_linear_pte_xor_finalize(void)
+{
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	/* This is where we would add Panther support for
+	 * 32MB and 256MB pages.
+	 */
+#endif
+}
+
+static void __init sun4v_linear_pte_xor_finalize(void)
+{
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	if (cpu_pgsz_mask & HV_PGSZ_MASK_256MB) {
+		kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
+			0xfffff80000000000UL;
+		kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+					   _PAGE_P_4V | _PAGE_W_4V);
+	} else {
+		kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
+	}
+
+	if (cpu_pgsz_mask & HV_PGSZ_MASK_2GB) {
+		kern_linear_pte_xor[2] = (_PAGE_VALID | _PAGE_SZ2GB_4V) ^
+			0xfffff80000000000UL;
+		kern_linear_pte_xor[2] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+					   _PAGE_P_4V | _PAGE_W_4V);
+	} else {
+		kern_linear_pte_xor[2] = kern_linear_pte_xor[1];
+	}
+
+	if (cpu_pgsz_mask & HV_PGSZ_MASK_16GB) {
+		kern_linear_pte_xor[3] = (_PAGE_VALID | _PAGE_SZ16GB_4V) ^
+			0xfffff80000000000UL;
+		kern_linear_pte_xor[3] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+					   _PAGE_P_4V | _PAGE_W_4V);
+	} else {
+		kern_linear_pte_xor[3] = kern_linear_pte_xor[2];
+	}
+#endif
+}
+
 /* paging_init() sets up the page tables */
 
 static unsigned long last_valid_pfn;
@@ -1665,10 +1787,8 @@ void __init paging_init(void)
 		ktsb_phys_patch();
 	}
 
-	if (tlb_type == hypervisor) {
+	if (tlb_type == hypervisor)
 		sun4v_patch_tlb_handlers();
-		sun4v_ktsb_init();
-	}
 
 	/* Find available physical memory...
 	 *
@@ -1727,9 +1847,6 @@ void __init paging_init(void)
 
 	__flush_tlb_all();
 
-	if (tlb_type == hypervisor)
-		sun4v_ktsb_register();
-
 	prom_build_devicetree();
 	of_populate_present_mask();
 #ifndef CONFIG_SMP
@@ -1742,8 +1859,36 @@ void __init paging_init(void)
 #ifndef CONFIG_SMP
 		mdesc_fill_in_cpu_data(cpu_all_mask);
 #endif
+		mdesc_get_page_sizes(cpu_all_mask, &cpu_pgsz_mask);
+
+		sun4v_linear_pte_xor_finalize();
+
+		sun4v_ktsb_init();
+		sun4v_ktsb_register();
+	} else {
+		unsigned long impl, ver;
+
+		cpu_pgsz_mask = (HV_PGSZ_MASK_8K | HV_PGSZ_MASK_64K |
+				 HV_PGSZ_MASK_512K | HV_PGSZ_MASK_4MB);
+
+		__asm__ __volatile__("rdpr %%ver, %0" : "=r" (ver));
+		impl = ((ver >> 32) & 0xffff);
+		if (impl == PANTHER_IMPL)
+			cpu_pgsz_mask |= (HV_PGSZ_MASK_32MB |
+					  HV_PGSZ_MASK_256MB);
+
+		sun4u_linear_pte_xor_finalize();
 	}
 
+	/* Flush the TLBs and the 4M TSB so that the updated linear
+	 * pte XOR settings are realized for all mappings.
+	 */
+	__flush_tlb_all();
+#ifndef CONFIG_DEBUG_PAGEALLOC
+	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
+#endif
+	__flush_tlb_all();
+
 	/* Setup bootmem... */
 	last_valid_pfn = end_pfn = bootmem_init(phys_base);
 
@@ -2110,6 +2255,7 @@ static void __init sun4u_pgprot_init(void)
 {
 	unsigned long page_none, page_shared, page_copy, page_readonly;
 	unsigned long page_exec_bit;
+	int i;
 
 	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID |
 				_PAGE_CACHE_4U | _PAGE_P_4U |
@@ -2137,8 +2283,8 @@ static void __init sun4u_pgprot_init(void)
 	kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
 				   _PAGE_P_4U | _PAGE_W_4U);
 
-	/* XXX Should use 256MB on Panther. XXX */
-	kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
+	for (i = 1; i < 4; i++)
+		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
 
 	_PAGE_SZBITS = _PAGE_SZBITS_4U;
 	_PAGE_ALL_SZ_BITS =  (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U |
@@ -2164,6 +2310,7 @@ static void __init sun4v_pgprot_init(void)
 {
 	unsigned long page_none, page_shared, page_copy, page_readonly;
 	unsigned long page_exec_bit;
+	int i;
 
 	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID |
 				_PAGE_CACHE_4V | _PAGE_P_4V |
@@ -2185,15 +2332,8 @@ static void __init sun4v_pgprot_init(void)
 	kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
 				   _PAGE_P_4V | _PAGE_W_4V);
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^
-		0xfffff80000000000UL;
-#else
-	kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
-		0xfffff80000000000UL;
-#endif
-	kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
-				   _PAGE_P_4V | _PAGE_W_4V);
+	for (i = 1; i < 4; i++)
+		kern_linear_pte_xor[i] = kern_linear_pte_xor[0];
 
 	pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V |
 		     __ACCESS_BITS_4V | _PAGE_E_4V);
diff --git a/arch/sparc/mm/init_64.h b/arch/sparc/mm/init_64.h
index 3e1ac8b96cae..0661aa606dec 100644
--- a/arch/sparc/mm/init_64.h
+++ b/arch/sparc/mm/init_64.h
@@ -8,12 +8,12 @@
 #define MAX_PHYS_ADDRESS	(1UL << 41UL)
 #define KPTE_BITMAP_CHUNK_SZ		(256UL * 1024UL * 1024UL)
 #define KPTE_BITMAP_BYTES	\
-	((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 8)
+	((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 4)
 #define VALID_ADDR_BITMAP_CHUNK_SZ	(4UL * 1024UL * 1024UL)
 #define VALID_ADDR_BITMAP_BYTES	\
 	((MAX_PHYS_ADDRESS / VALID_ADDR_BITMAP_CHUNK_SZ) / 8)
 
-extern unsigned long kern_linear_pte_xor[2];
+extern unsigned long kern_linear_pte_xor[4];
 extern unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
 extern unsigned int sparc64_highest_unlocked_tlb_ent;
 extern unsigned long sparc64_kern_pri_context;
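Checking the "/ 8" -> "/ 4" change: going from one bit to two bits per
256MB chunk doubles the bitmap.  A build-time sanity sketch of the
arithmetic, using the values defined above:

	#define MAX_PHYS_ADDRESS	(1UL << 41UL)
	#define KPTE_BITMAP_CHUNK_SZ	(256UL * 1024UL * 1024UL)

	/* 2^41 / 2^28 = 8192 chunks; 2 bits each = 8192 / 4 = 2048 bytes */
	_Static_assert((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 4 == 2048,
		       "two bits per 256MB chunk gives a 2048-byte bitmap");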