Diffstat (limited to 'arch/sparc64')
-rw-r--r--  arch/sparc64/Kconfig  |  30
-rw-r--r--  arch/sparc64/Kconfig.debug  |  2
-rw-r--r--  arch/sparc64/defconfig  |  60
-rw-r--r--  arch/sparc64/kernel/Makefile  |  8
-rw-r--r--  arch/sparc64/kernel/binfmt_aout32.c  |  14
-rw-r--r--  arch/sparc64/kernel/binfmt_elf32.c  |  4
-rw-r--r--  arch/sparc64/kernel/cpu.c  |  7
-rw-r--r--  arch/sparc64/kernel/devices.c  |  189
-rw-r--r--  arch/sparc64/kernel/dtlb_backend.S  |  170
-rw-r--r--  arch/sparc64/kernel/dtlb_base.S  |  109
-rw-r--r--  arch/sparc64/kernel/dtlb_miss.S  |  39
-rw-r--r--  arch/sparc64/kernel/ebus.c  |  3
-rw-r--r--  arch/sparc64/kernel/entry.S  |  331
-rw-r--r--  arch/sparc64/kernel/etrap.S  |  170
-rw-r--r--  arch/sparc64/kernel/head.S  |  254
-rw-r--r--  arch/sparc64/kernel/irq.c  |  345
-rw-r--r--  arch/sparc64/kernel/itlb_base.S  |  79
-rw-r--r--  arch/sparc64/kernel/itlb_miss.S  |  39
-rw-r--r--  arch/sparc64/kernel/kprobes.c  |  69
-rw-r--r--  arch/sparc64/kernel/ktlb.S  |  363
-rw-r--r--  arch/sparc64/kernel/pci.c  |  15
-rw-r--r--  arch/sparc64/kernel/pci_common.c  |  301
-rw-r--r--  arch/sparc64/kernel/pci_iommu.c  |  36
-rw-r--r--  arch/sparc64/kernel/pci_psycho.c  |  23
-rw-r--r--  arch/sparc64/kernel/pci_sabre.c  |  23
-rw-r--r--  arch/sparc64/kernel/pci_schizo.c  |  24
-rw-r--r--  arch/sparc64/kernel/pci_sun4v.c  |  1147
-rw-r--r--  arch/sparc64/kernel/pci_sun4v.h  |  31
-rw-r--r--  arch/sparc64/kernel/pci_sun4v_asm.S  |  95
-rw-r--r--  arch/sparc64/kernel/process.c  |  133
-rw-r--r--  arch/sparc64/kernel/ptrace.c  |  3
-rw-r--r--  arch/sparc64/kernel/rtrap.S  |  115
-rw-r--r--  arch/sparc64/kernel/sbus.c  |  10
-rw-r--r--  arch/sparc64/kernel/setup.c  |  409
-rw-r--r--  arch/sparc64/kernel/smp.c  |  453
-rw-r--r--  arch/sparc64/kernel/sparc64_ksyms.c  |  41
-rw-r--r--  arch/sparc64/kernel/sun4v_ivec.S  |  334
-rw-r--r--  arch/sparc64/kernel/sun4v_tlb_miss.S  |  426
-rw-r--r--  arch/sparc64/kernel/sys_sparc.c  |  297
-rw-r--r--  arch/sparc64/kernel/sys_sparc32.c  |  83
-rw-r--r--  arch/sparc64/kernel/systbls.S  |  2
-rw-r--r--  arch/sparc64/kernel/time.c  |  389
-rw-r--r--  arch/sparc64/kernel/trampoline.S  |  238
-rw-r--r--  arch/sparc64/kernel/traps.c  |  439
-rw-r--r--  arch/sparc64/kernel/tsb.S  |  552
-rw-r--r--  arch/sparc64/kernel/ttable.S  |  63
-rw-r--r--  arch/sparc64/kernel/unaligned.c  |  45
-rw-r--r--  arch/sparc64/kernel/us2e_cpufreq.c  |  11
-rw-r--r--  arch/sparc64/kernel/us3_cpufreq.c  |  11
-rw-r--r--  arch/sparc64/kernel/visemul.c  |  894
-rw-r--r--  arch/sparc64/kernel/vmlinux.lds.S  |  16
-rw-r--r--  arch/sparc64/kernel/winfixup.S  |  480
-rw-r--r--  arch/sparc64/lib/Makefile  |  4
-rw-r--r--  arch/sparc64/lib/NGbzero.S  |  163
-rw-r--r--  arch/sparc64/lib/NGcopy_from_user.S  |  37
-rw-r--r--  arch/sparc64/lib/NGcopy_to_user.S  |  40
-rw-r--r--  arch/sparc64/lib/NGmemcpy.S  |  368
-rw-r--r--  arch/sparc64/lib/NGpage.S  |  96
-rw-r--r--  arch/sparc64/lib/NGpatch.S  |  33
-rw-r--r--  arch/sparc64/lib/U3patch.S  |  3
-rw-r--r--  arch/sparc64/lib/bzero.S  |  18
-rw-r--r--  arch/sparc64/lib/clear_page.S  |  12
-rw-r--r--  arch/sparc64/lib/copy_page.S  |  7
-rw-r--r--  arch/sparc64/lib/delay.c  |  19
-rw-r--r--  arch/sparc64/lib/find_bit.c  |  127
-rw-r--r--  arch/sparc64/lib/xor.S  |  300
-rw-r--r--  arch/sparc64/math-emu/math.c  |  24
-rw-r--r--  arch/sparc64/mm/Makefile  |  2
-rw-r--r--  arch/sparc64/mm/fault.c  |  24
-rw-r--r--  arch/sparc64/mm/generic.c  |  41
-rw-r--r--  arch/sparc64/mm/hugetlbpage.c  |  219
-rw-r--r--  arch/sparc64/mm/init.c  |  1452
-rw-r--r--  arch/sparc64/mm/tlb.c  |  64
-rw-r--r--  arch/sparc64/mm/tsb.c  |  500
-rw-r--r--  arch/sparc64/mm/ultra.S  |  374
-rw-r--r--  arch/sparc64/prom/cif.S  |  211
-rw-r--r--  arch/sparc64/prom/console.c  |  6
-rw-r--r--  arch/sparc64/prom/init.c  |  60
-rw-r--r--  arch/sparc64/prom/misc.c  |  44
-rw-r--r--  arch/sparc64/prom/p1275.c  |  11
-rw-r--r--  arch/sparc64/prom/tree.c  |  9
-rw-r--r--  arch/sparc64/solaris/misc.c  |  4
82 files changed, 10102 insertions, 3594 deletions
diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index 4c0a50a76554..d1e2fc566486 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -162,6 +162,14 @@ config RWSEM_XCHGADD_ALGORITHM
 	bool
 	default y
 
+config GENERIC_FIND_NEXT_BIT
+	bool
+	default y
+
+config GENERIC_HWEIGHT
+	bool
+	default y if !ULTRA_HAS_POPULATION_COUNT
+
 config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
@@ -175,17 +183,26 @@ config HUGETLB_PAGE_SIZE_4MB
 	bool "4MB"
 
 config HUGETLB_PAGE_SIZE_512K
-	depends on !SPARC64_PAGE_SIZE_4MB
+	depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB
 	bool "512K"
 
 config HUGETLB_PAGE_SIZE_64K
-	depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB
+	depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB && !SPARC64_PAGE_SIZE_64K
 	bool "64K"
 
 endchoice
 
 endmenu
 
+config ARCH_SPARSEMEM_ENABLE
+	def_bool y
+
+config ARCH_SPARSEMEM_DEFAULT
+	def_bool y
+
+config LARGE_ALLOCS
+	def_bool y
+
 source "mm/Kconfig"
 
 config GENERIC_ISA_DMA
@@ -350,6 +367,15 @@ config SOLARIS_EMUL
 
 endmenu
 
+config SCHED_SMT
+	bool "SMT (Hyperthreading) scheduler support"
+	depends on SMP
+	default y
+	help
+	  SMT scheduler support improves the CPU scheduler's decision making
+	  when dealing with UltraSPARC cpus at a cost of slightly increased
+	  overhead in some places. If unsure say N here.
+
 config CMDLINE_BOOL
 	bool "Default bootloader kernel arguments"
 
diff --git a/arch/sparc64/Kconfig.debug b/arch/sparc64/Kconfig.debug
index 3e31be494e54..afe0a7720a26 100644
--- a/arch/sparc64/Kconfig.debug
+++ b/arch/sparc64/Kconfig.debug
@@ -24,7 +24,7 @@ config DEBUG_BOOTMEM
 	bool "Debug BOOTMEM initialization"
 
 config DEBUG_PAGEALLOC
-	bool "Page alloc debugging"
+	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL && !SOFTWARE_SUSPEND
 	help
 	  Unmap pages from the kernel linear mapping after free_pages().
diff --git a/arch/sparc64/defconfig b/arch/sparc64/defconfig
index 069d49777b2a..900fb0b940d8 100644
--- a/arch/sparc64/defconfig
+++ b/arch/sparc64/defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.16-rc2
-# Tue Feb  7 17:47:18 2006
+# Linux kernel version: 2.6.16
+# Sun Mar 26 14:58:11 2006
 #
 CONFIG_SPARC=y
 CONFIG_SPARC64=y
@@ -38,6 +38,7 @@ CONFIG_POSIX_MQUEUE=y
 CONFIG_SYSCTL=y
 # CONFIG_AUDIT is not set
 # CONFIG_IKCONFIG is not set
+CONFIG_RELAY=y
 CONFIG_INITRAMFS_SOURCE=""
 CONFIG_UID16=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
@@ -53,10 +54,6 @@ CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
 CONFIG_EPOLL=y
 CONFIG_SHMEM=y
-CONFIG_CC_ALIGN_FUNCTIONS=0
-CONFIG_CC_ALIGN_LABELS=0
-CONFIG_CC_ALIGN_LOOPS=0
-CONFIG_CC_ALIGN_JUMPS=0
 CONFIG_SLAB=y
 # CONFIG_TINY_SHMEM is not set
 CONFIG_BASE_SMALL=0
@@ -68,7 +65,6 @@ CONFIG_BASE_SMALL=0
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_FORCE_UNLOAD=y
-CONFIG_OBSOLETE_MODPARM=y
 CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_KMOD=y
@@ -76,6 +72,7 @@ CONFIG_KMOD=y
 #
 # Block layer
 #
+CONFIG_BLK_DEV_IO_TRACE=y
 
 #
 # IO Schedulers
@@ -111,17 +108,24 @@ CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
 CONFIG_US3_FREQ=m
 CONFIG_US2E_FREQ=m
 CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_HWEIGHT=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
 CONFIG_HUGETLB_PAGE_SIZE_4MB=y
 # CONFIG_HUGETLB_PAGE_SIZE_512K is not set
 # CONFIG_HUGETLB_PAGE_SIZE_64K is not set
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_LARGE_ALLOCS=y
 CONFIG_SELECT_MEMORY_MODEL=y
-CONFIG_FLATMEM_MANUAL=y
+# CONFIG_FLATMEM_MANUAL is not set
 # CONFIG_DISCONTIGMEM_MANUAL is not set
-# CONFIG_SPARSEMEM_MANUAL is not set
-CONFIG_FLATMEM=y
-CONFIG_FLAT_NODE_MEM_MAP=y
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_HAVE_MEMORY_PRESENT=y
 # CONFIG_SPARSEMEM_STATIC is not set
+CONFIG_SPARSEMEM_EXTREME=y
+CONFIG_MEMORY_HOTPLUG=y
 CONFIG_SPLIT_PTLOCK_CPUS=4
 CONFIG_GENERIC_ISA_DMA=y
 CONFIG_SBUS=y
@@ -130,7 +134,6 @@ CONFIG_SUN_AUXIO=y
 CONFIG_SUN_IO=y
 CONFIG_PCI=y
 CONFIG_PCI_DOMAINS=y
-# CONFIG_PCI_LEGACY_PROC is not set
 # CONFIG_PCI_DEBUG is not set
 CONFIG_SUN_OPENPROMFS=m
 CONFIG_SPARC32_COMPAT=y
@@ -195,6 +198,8 @@ CONFIG_TCP_CONG_VEGAS=m
 CONFIG_TCP_CONG_SCALABLE=m
 CONFIG_IPV6=m
 CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
 CONFIG_INET6_AH=m
 CONFIG_INET6_ESP=m
 CONFIG_INET6_IPCOMP=m
@@ -207,10 +212,12 @@ CONFIG_IPV6_TUNNEL=m
 #
 CONFIG_IP_DCCP=m
 CONFIG_INET_DCCP_DIAG=m
+CONFIG_IP_DCCP_ACKVEC=y
 
 #
 # DCCP CCIDs Configuration (EXPERIMENTAL)
 #
+CONFIG_IP_DCCP_CCID2=m
 CONFIG_IP_DCCP_CCID3=m
 CONFIG_IP_DCCP_TFRC_LIB=m
 
@@ -218,7 +225,6 @@ CONFIG_IP_DCCP_TFRC_LIB=m
 # DCCP Kernel Hacking
 #
 # CONFIG_IP_DCCP_DEBUG is not set
-# CONFIG_IP_DCCP_UNLOAD_HACK is not set
 
 #
 # SCTP Configuration (EXPERIMENTAL)
@@ -303,6 +309,7 @@ CONFIG_BLK_DEV_NBD=m
 CONFIG_BLK_DEV_UB=m
 # CONFIG_BLK_DEV_RAM is not set
 CONFIG_BLK_DEV_RAM_COUNT=16
+# CONFIG_BLK_DEV_INITRD is not set
 CONFIG_CDROM_PKTCDVD=m
 CONFIG_CDROM_PKTCDVD_BUFFERS=8
 CONFIG_CDROM_PKTCDVD_WCACHE=y
@@ -655,6 +662,7 @@ CONFIG_SERIAL_SUNCORE=y
 CONFIG_SERIAL_SUNSU=y
 CONFIG_SERIAL_SUNSU_CONSOLE=y
 CONFIG_SERIAL_SUNSAB=m
+CONFIG_SERIAL_SUNHV=y
 CONFIG_SERIAL_CORE=y
 CONFIG_SERIAL_CORE_CONSOLE=y
 # CONFIG_SERIAL_JSM is not set
@@ -715,7 +723,6 @@ CONFIG_I2C_ALGOBIT=y
 # CONFIG_I2C_PARPORT_LIGHT is not set
 # CONFIG_I2C_PROSAVAGE is not set
 # CONFIG_I2C_SAVAGE4 is not set
-# CONFIG_SCx200_ACB is not set
 # CONFIG_I2C_SIS5595 is not set
 # CONFIG_I2C_SIS630 is not set
 # CONFIG_I2C_SIS96X is not set
@@ -801,10 +808,6 @@ CONFIG_HWMON=y
 #
 
 #
-# Multimedia Capabilities Port drivers
-#
-
-#
 # Multimedia devices
 #
 # CONFIG_VIDEO_DEV is not set
@@ -813,6 +816,7 @@ CONFIG_HWMON=y
 # Digital Video Broadcasting Devices
 #
 # CONFIG_DVB is not set
+# CONFIG_USB_DABUSB is not set
 
 #
 # Graphics support
@@ -894,10 +898,12 @@ CONFIG_SND_SEQ_DUMMY=m
 CONFIG_SND_OSSEMUL=y
 CONFIG_SND_MIXER_OSS=m
 CONFIG_SND_PCM_OSS=m
+CONFIG_SND_PCM_OSS_PLUGINS=y
 CONFIG_SND_SEQUENCER_OSS=y
 # CONFIG_SND_RTCTIMER is not set
 # CONFIG_SND_DYNAMIC_MINORS is not set
 CONFIG_SND_SUPPORT_OLD_API=y
+CONFIG_SND_VERBOSE_PROCFS=y
 # CONFIG_SND_VERBOSE_PRINTK is not set
 # CONFIG_SND_DEBUG is not set
 
@@ -980,6 +986,7 @@ CONFIG_SND_SUN_CS4231=m
 #
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
+CONFIG_USB_ARCH_HAS_EHCI=y
 CONFIG_USB=y
 # CONFIG_USB_DEBUG is not set
 
@@ -1007,7 +1014,6 @@ CONFIG_USB_UHCI_HCD=m
 #
 # USB Device Class drivers
 #
-# CONFIG_OBSOLETE_OSS_USB_DRIVER is not set
 # CONFIG_USB_ACM is not set
 # CONFIG_USB_PRINTER is not set
 
@@ -1051,15 +1057,6 @@ CONFIG_USB_HIDDEV=y
 # CONFIG_USB_MICROTEK is not set
 
 #
-# USB Multimedia devices
-#
-# CONFIG_USB_DABUSB is not set
-
-#
-# Video4Linux support is needed for USB Multimedia device support
-#
-
-#
 # USB Network Adapters
 #
 # CONFIG_USB_CATC is not set
@@ -1116,11 +1113,7 @@ CONFIG_USB_HIDDEV=y
 # CONFIG_INFINIBAND is not set
 
 #
-# SN Devices
-#
-
-#
-# EDAC - error detection and reporting (RAS)
+# EDAC - error detection and reporting (RAS) (EXPERIMENTAL)
 #
 
 #
@@ -1191,7 +1184,6 @@ CONFIG_TMPFS=y
 CONFIG_HUGETLBFS=y
 CONFIG_HUGETLB_PAGE=y
 CONFIG_RAMFS=y
-CONFIG_RELAYFS_FS=m
 # CONFIG_CONFIGFS_FS is not set
 
 #
diff --git a/arch/sparc64/kernel/Makefile b/arch/sparc64/kernel/Makefile
index 83d67eb18895..6f6816488b04 100644
--- a/arch/sparc64/kernel/Makefile
+++ b/arch/sparc64/kernel/Makefile
@@ -11,10 +11,12 @@ obj-y		:= process.o setup.o cpu.o idprom.o \
 		   traps.o devices.o auxio.o una_asm.o \
 		   irq.o ptrace.o time.o sys_sparc.o signal.o \
 		   unaligned.o central.o pci.o starfire.o semaphore.o \
-		   power.o sbus.o iommu_common.o sparc64_ksyms.o chmc.o
+		   power.o sbus.o iommu_common.o sparc64_ksyms.o chmc.o \
+		   visemul.o
 
 obj-$(CONFIG_PCI)	 += ebus.o isa.o pci_common.o pci_iommu.o \
-			    pci_psycho.o pci_sabre.o pci_schizo.o
+			    pci_psycho.o pci_sabre.o pci_schizo.o \
+			    pci_sun4v.o pci_sun4v_asm.o
 obj-$(CONFIG_SMP)	 += smp.o trampoline.o
 obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o
 obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o
@@ -38,5 +40,5 @@ else
   CMODEL_CFLAG := -m64 -mcmodel=medlow
 endif
 
-head.o: head.S ttable.S itlb_base.S dtlb_base.S dtlb_backend.S dtlb_prot.S \
+head.o: head.S ttable.S itlb_miss.S dtlb_miss.S ktlb.S tsb.S \
 	etrap.S rtrap.S winfixup.S entry.S
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
index 202a80c24b6f..d7caa60a0074 100644
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -31,6 +31,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
+#include <asm/mmu_context.h>
 
 static int load_aout32_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout32_library(struct file*);
@@ -238,6 +239,8 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		(current->mm->start_data = N_DATADDR(ex));
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
+	current->mm->free_area_cache = current->mm->mmap_base;
+	current->mm->cached_hole_size = 0;
 
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
@@ -329,15 +332,8 @@ beyond_if:
 
 	current->mm->start_stack =
 		(unsigned long) create_aout32_tables((char __user *)bprm->p, bprm);
-	if (!(orig_thr_flags & _TIF_32BIT)) {
-		unsigned long pgd_cache = get_pgd_cache(current->mm->pgd);
-
-		__asm__ __volatile__("stxa\t%0, [%1] %2\n\t"
-				     "membar #Sync"
-				     : /* no outputs */
-				     : "r" (pgd_cache),
-				       "r" (TSB_REG), "i" (ASI_DMMU));
-	}
+	tsb_context_switch(current->mm);
+
 	start_thread32(regs, ex.a_entry, current->mm->start_stack);
 	if (current->ptrace & PT_PTRACED)
 		send_sig(SIGTRAP, current, 0);
diff --git a/arch/sparc64/kernel/binfmt_elf32.c b/arch/sparc64/kernel/binfmt_elf32.c
index a1a12d2aa353..8a2abcce2737 100644
--- a/arch/sparc64/kernel/binfmt_elf32.c
+++ b/arch/sparc64/kernel/binfmt_elf32.c
@@ -153,7 +153,9 @@ MODULE_AUTHOR("Eric Youngdale, David S. Miller, Jakub Jelinek");
 #undef MODULE_DESCRIPTION
 #undef MODULE_AUTHOR
 
+#include <asm/a.out.h>
+
 #undef TASK_SIZE
-#define TASK_SIZE 0xf0000000
+#define TASK_SIZE STACK_TOP32
 
 #include "../../../fs/binfmt_elf.c"
diff --git a/arch/sparc64/kernel/cpu.c b/arch/sparc64/kernel/cpu.c
index 00eed88ef2e8..11cc0caef592 100644
--- a/arch/sparc64/kernel/cpu.c
+++ b/arch/sparc64/kernel/cpu.c
@@ -13,6 +13,7 @@
 #include <asm/system.h>
 #include <asm/fpumacro.h>
 #include <asm/cpudata.h>
+#include <asm/spitfire.h>
 
 DEFINE_PER_CPU(cpuinfo_sparc, __cpu_data) = { 0 };
 
@@ -71,6 +72,12 @@ void __init cpu_probe(void)
 	unsigned long ver, fpu_vers, manuf, impl, fprs;
 	int i;
 	
+	if (tlb_type == hypervisor) {
+		sparc_cpu_type = "UltraSparc T1 (Niagara)";
+		sparc_fpu_type = "UltraSparc T1 integrated FPU";
+		return;
+	}
+
 	fprs = fprs_read();
 	fprs_write(FPRS_FEF);
 	__asm__ __volatile__ ("rdpr %%ver, %0; stx %%fsr, [%1]"
diff --git a/arch/sparc64/kernel/devices.c b/arch/sparc64/kernel/devices.c
index df9a1ca8fd77..007e8922cd16 100644
--- a/arch/sparc64/kernel/devices.c
+++ b/arch/sparc64/kernel/devices.c
@@ -12,6 +12,7 @@
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/errno.h>
+#include <linux/bootmem.h>
 
 #include <asm/page.h>
 #include <asm/oplib.h>
@@ -20,6 +21,8 @@
 #include <asm/spitfire.h>
 #include <asm/timer.h>
 #include <asm/cpudata.h>
+#include <asm/vdev.h>
+#include <asm/irq.h>
 
 /* Used to synchronize acceses to NatSemi SUPER I/O chip configure
  * operations in asm/ns87303.h
@@ -29,13 +32,158 @@ DEFINE_SPINLOCK(ns87303_lock);
 extern void cpu_probe(void);
 extern void central_probe(void);
 
-static char *cpu_mid_prop(void)
+u32 sun4v_vdev_devhandle;
+int sun4v_vdev_root;
+
+struct vdev_intmap {
+	unsigned int phys;
+	unsigned int irq;
+	unsigned int cnode;
+	unsigned int cinterrupt;
+};
+
+struct vdev_intmask {
+	unsigned int phys;
+	unsigned int interrupt;
+	unsigned int __unused;
+};
+
+static struct vdev_intmap *vdev_intmap;
+static int vdev_num_intmap;
+static struct vdev_intmask vdev_intmask;
+
+static void __init sun4v_virtual_device_probe(void)
+{
+	struct linux_prom64_registers regs;
+	struct vdev_intmap *ip;
+	int node, sz, err;
+
+	if (tlb_type != hypervisor)
+		return;
+
+	node = prom_getchild(prom_root_node);
+	node = prom_searchsiblings(node, "virtual-devices");
+	if (!node) {
+		prom_printf("SUN4V: Fatal error, no virtual-devices node.\n");
+		prom_halt();
+	}
+
+	sun4v_vdev_root = node;
+
+	prom_getproperty(node, "reg", (char *)&regs, sizeof(regs));
+	sun4v_vdev_devhandle = (regs.phys_addr >> 32UL) & 0x0fffffff;
+
+	sz = prom_getproplen(node, "interrupt-map");
+	if (sz <= 0) {
+		prom_printf("SUN4V: Error, no vdev interrupt-map.\n");
+		prom_halt();
+	}
+
+	if ((sz % sizeof(*ip)) != 0) {
+		prom_printf("SUN4V: Bogus interrupt-map property size %d\n",
+			    sz);
+		prom_halt();
+	}
+
+	vdev_intmap = ip = alloc_bootmem_low_pages(sz);
+	if (!vdev_intmap) {
+		prom_printf("SUN4V: Error, cannot allocate vdev_intmap.\n");
+		prom_halt();
+	}
+
+	err = prom_getproperty(node, "interrupt-map", (char *) ip, sz);
+	if (err == -1) {
+		prom_printf("SUN4V: Fatal error, no vdev interrupt-map.\n");
+		prom_halt();
+	}
+	if (err != sz) {
+		prom_printf("SUN4V: Inconsistent interrupt-map size, "
+			    "proplen(%d) vs getprop(%d).\n", sz,err);
+		prom_halt();
+	}
+
+	vdev_num_intmap = err / sizeof(*ip);
+
+	err = prom_getproperty(node, "interrupt-map-mask",
+			       (char *) &vdev_intmask,
+			       sizeof(vdev_intmask));
+	if (err <= 0) {
+		prom_printf("SUN4V: Fatal error, no vdev "
+			    "interrupt-map-mask.\n");
+		prom_halt();
+	}
+	if (err % sizeof(vdev_intmask)) {
+		prom_printf("SUN4V: Bogus interrupt-map-mask "
+			    "property size %d\n", err);
+		prom_halt();
+	}
+
+	printk("SUN4V: virtual-devices devhandle[%x]\n",
+	       sun4v_vdev_devhandle);
+}
+
+unsigned int sun4v_vdev_device_interrupt(unsigned int dev_node)
+{
+	unsigned int irq, reg;
+	int err, i;
+
+	err = prom_getproperty(dev_node, "interrupts",
+			       (char *) &irq, sizeof(irq));
+	if (err <= 0) {
+		printk("VDEV: Cannot get \"interrupts\" "
+		       "property for OBP node %x\n", dev_node);
+		return 0;
+	}
+
+	err = prom_getproperty(dev_node, "reg",
+			       (char *) &reg, sizeof(reg));
+	if (err <= 0) {
+		printk("VDEV: Cannot get \"reg\" "
+		       "property for OBP node %x\n", dev_node);
+		return 0;
+	}
+
+	for (i = 0; i < vdev_num_intmap; i++) {
+		if (vdev_intmap[i].phys == (reg & vdev_intmask.phys) &&
+		    vdev_intmap[i].irq == (irq & vdev_intmask.interrupt)) {
+			irq = vdev_intmap[i].cinterrupt;
+			break;
+		}
+	}
+
+	if (i == vdev_num_intmap) {
+		printk("VDEV: No matching interrupt map entry "
+		       "for OBP node %x\n", dev_node);
+		return 0;
+	}
+
+	return sun4v_build_irq(sun4v_vdev_devhandle, irq, 5, 0);
+}
+
+static const char *cpu_mid_prop(void)
 {
 	if (tlb_type == spitfire)
 		return "upa-portid";
 	return "portid";
 }
 
+static int get_cpu_mid(int prom_node)
+{
+	if (tlb_type == hypervisor) {
+		struct linux_prom64_registers reg;
+
+		if (prom_getproplen(prom_node, "cpuid") == 4)
+			return prom_getintdefault(prom_node, "cpuid", 0);
+
+		prom_getproperty(prom_node, "reg", (char *) &reg, sizeof(reg));
+		return (reg.phys_addr >> 32) & 0x0fffffffUL;
+	} else {
+		const char *prop_name = cpu_mid_prop();
+
+		return prom_getintdefault(prom_node, prop_name, 0);
+	}
+}
+
 static int check_cpu_node(int nd, int *cur_inst,
 			  int (*compare)(int, int, void *), void *compare_arg,
 			  int *prom_node, int *mid)
@@ -50,7 +198,7 @@ static int check_cpu_node(int nd, int *cur_inst,
 		if (prom_node)
 			*prom_node = nd;
 		if (mid)
-			*mid = prom_getintdefault(nd, cpu_mid_prop(), 0);
+			*mid = get_cpu_mid(nd);
 		return 0;
 	}
 
@@ -105,7 +253,7 @@ static int cpu_mid_compare(int nd, int instance, void *_arg)
 	int desired_mid = (int) (long) _arg;
 	int this_mid;
 
-	this_mid = prom_getintdefault(nd, cpu_mid_prop(), 0);
+	this_mid = get_cpu_mid(nd);
 	if (this_mid == desired_mid)
 		return 0;
 	return -ENODEV;
@@ -126,7 +274,8 @@ void __init device_scan(void)
 
 #ifndef CONFIG_SMP
 	{
-		int err, cpu_node;
+		int err, cpu_node, def;
+
 		err = cpu_find_by_instance(0, &cpu_node, NULL);
 		if (err) {
 			prom_printf("No cpu nodes, cannot continue\n");
@@ -135,21 +284,40 @@ void __init device_scan(void)
 		cpu_data(0).clock_tick = prom_getintdefault(cpu_node,
 							    "clock-frequency",
 							    0);
+
+		def = ((tlb_type == hypervisor) ?
+		       (8 * 1024) :
+		       (16 * 1024));
 		cpu_data(0).dcache_size = prom_getintdefault(cpu_node,
 							     "dcache-size",
-							     16 * 1024);
+							     def);
+
+		def = 32;
 		cpu_data(0).dcache_line_size =
-			prom_getintdefault(cpu_node, "dcache-line-size", 32);
+			prom_getintdefault(cpu_node, "dcache-line-size",
+					   def);
+
+		def = 16 * 1024;
 		cpu_data(0).icache_size = prom_getintdefault(cpu_node,
 							     "icache-size",
-							     16 * 1024);
+							     def);
+
+		def = 32;
 		cpu_data(0).icache_line_size =
-			prom_getintdefault(cpu_node, "icache-line-size", 32);
+			prom_getintdefault(cpu_node, "icache-line-size",
+					   def);
+
+		def = ((tlb_type == hypervisor) ?
+		       (3 * 1024 * 1024) :
+		       (4 * 1024 * 1024));
 		cpu_data(0).ecache_size = prom_getintdefault(cpu_node,
 							     "ecache-size",
-							     4 * 1024 * 1024);
+							     def);
+
+		def = 64;
 		cpu_data(0).ecache_line_size =
-			prom_getintdefault(cpu_node, "ecache-line-size", 64);
+			prom_getintdefault(cpu_node, "ecache-line-size",
+					   def);
 		printk("CPU[0]: Caches "
 		       "D[sz(%d):line_sz(%d)] "
 		       "I[sz(%d):line_sz(%d)] "
@@ -160,6 +328,7 @@ void __init device_scan(void)
 	}
 #endif
 
+	sun4v_virtual_device_probe();
 	central_probe();
 
 	cpu_probe();
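
[Editor's note: the devices.c hunk above resolves a virtual-device node's "interrupts"/"reg" pair through the hub's interrupt-map and interrupt-map-mask, then hands the result to sun4v_build_irq(). A driver sitting under the virtual-devices node would use it roughly as sketched below; the prom_* helpers are the ones the patch itself calls, while the surrounding driver fragment and the "console" node name are hypothetical.]

/* Hypothetical probe fragment for a device under the sun4v
 * "virtual-devices" hub, using the helpers added in devices.c above.
 */
extern int sun4v_vdev_root;
extern unsigned int sun4v_vdev_device_interrupt(unsigned int dev_node);
extern int prom_getchild(int node);
extern int prom_searchsiblings(int node, const char *name);

static int example_vdev_probe(void)
{
	int node = prom_searchsiblings(prom_getchild(sun4v_vdev_root), "console");
	unsigned int irq;

	if (!node)
		return -1;	/* no such child node under virtual-devices */

	irq = sun4v_vdev_device_interrupt(node);
	if (!irq)
		return -1;	/* no interrupt-map entry matched */

	/* request_irq(irq, ...) would follow in a real driver */
	return 0;
}
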
diff --git a/arch/sparc64/kernel/dtlb_backend.S b/arch/sparc64/kernel/dtlb_backend.S
deleted file mode 100644
index acc889a7f9c1..000000000000
--- a/arch/sparc64/kernel/dtlb_backend.S
+++ /dev/null
@@ -1,170 +0,0 @@
-/* $Id: dtlb_backend.S,v 1.16 2001/10/09 04:02:11 davem Exp $
- * dtlb_backend.S: Back end to DTLB miss replacement strategy.
- *                 This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-
-#define VALID_SZ_BITS	(_PAGE_VALID | _PAGE_SZBITS)
-
-#define VPTE_BITS		(_PAGE_CP | _PAGE_CV | _PAGE_P )
-#define VPTE_SHIFT		(PAGE_SHIFT - 3)
-
-/* Ways we can get here:
- *
- * 1) Nucleus loads and stores to/from PA-->VA direct mappings at tl>1.
- * 2) Nucleus loads and stores to/from user/kernel window save areas.
- * 3) VPTE misses from dtlb_base and itlb_base.
- *
- * We need to extract out the PMD and PGDIR indexes from the
- * linear virtual page table access address.  The PTE index
- * is at the bottom, but we are not concerned with it.  Bits
- * 0 to 2 are clear since each PTE is 8 bytes in size.  Each
- * PMD and PGDIR entry are 4 bytes in size.   Thus, this
- * address looks something like:
- *
- * |---------------------------------------------------------------|
- * |  ...   |    PGDIR index    |    PMD index    | PTE index  |   |
- * |---------------------------------------------------------------|
- *   63   F   E               D   C             B   A         3 2 0  <- bit nr
- *
- *  The variable bits above are defined as:
- *  A --> 3 + (PAGE_SHIFT - log2(8))
- *    --> 3 + (PAGE_SHIFT - 3) - 1
- *        (ie. this is "bit 3" + PAGE_SIZE - size of PTE entry in bits - 1)
- *  B --> A + 1
- *  C --> B + (PAGE_SHIFT - log2(4))
- *    -->  B + (PAGE_SHIFT - 2) - 1
- *        (ie. this is "bit B" + PAGE_SIZE - size of PMD entry in bits - 1)
- *  D --> C + 1
- *  E --> D + (PAGE_SHIFT - log2(4))
- *    --> D + (PAGE_SHIFT - 2) - 1
- *        (ie. this is "bit D" + PAGE_SIZE - size of PGDIR entry in bits - 1)
- *  F --> E + 1
- *
- * (Note how "B" always evalutes to PAGE_SHIFT, all the other constants
- *  cancel out.)
- *
- * For 8K PAGE_SIZE (thus, PAGE_SHIFT of 13) the bit numbers are:
- * A --> 12
- * B --> 13
- * C --> 23
- * D --> 24
- * E --> 34
- * F --> 35
- *
- * For 64K PAGE_SIZE (thus, PAGE_SHIFT of 16) the bit numbers are:
- * A --> 15
- * B --> 16
- * C --> 29
- * D --> 30
- * E --> 43
- * F --> 44
- *
- * Because bits both above and below each PGDIR and PMD index need to
- * be masked out, and the index can be as long as 14 bits (when using a
- * 64K PAGE_SIZE, and thus a PAGE_SHIFT of 16), we need 3 instructions
- * to extract each index out.
- *
- * Shifts do not pair very well on UltraSPARC-I, II, IIi, and IIe, so
- * we try to avoid using them for the entire operation.  We could setup
- * a mask anywhere from bit 31 down to bit 10 using the sethi instruction.
- *
- * We need a mask covering bits B --> C and one covering D --> E.
- * For 8K PAGE_SIZE these masks are 0x00ffe000 and 0x7ff000000.
- * For 64K PAGE_SIZE these masks are 0x3fff0000 and 0xfffc0000000.
- * The second in each set cannot be loaded with a single sethi
- * instruction, because the upper bits are past bit 32.  We would
- * need to use a sethi + a shift.
- *
- * For the time being, we use 2 shifts and a simple "and" mask.
- * We shift left to clear the bits above the index, we shift down
- * to clear the bits below the index (sans the log2(4 or 8) bits)
- * and a mask to clear the log2(4 or 8) bits.  We need therefore
- * define 4 shift counts, all of which are relative to PAGE_SHIFT.
- *
- * Although unsupportable for other reasons, this does mean that
- * 512K and 4MB page sizes would be generaally supported by the
- * kernel.  (ELF binaries would break with > 64K PAGE_SIZE since
- * the sections are only aligned that strongly).
- *
- * The operations performed for extraction are thus:
- *
- *      ((X << FOO_SHIFT_LEFT) >> FOO_SHIFT_RIGHT) & ~0x3
- *
- */
-
-#define A (3 + (PAGE_SHIFT - 3) - 1)
-#define B (A + 1)
-#define C (B + (PAGE_SHIFT - 2) - 1)
-#define D (C + 1)
-#define E (D + (PAGE_SHIFT - 2) - 1)
-#define F (E + 1)
-
-#define PMD_SHIFT_LEFT		(64 - D)
-#define PMD_SHIFT_RIGHT		(64 - (D - B) - 2)
-#define PGDIR_SHIFT_LEFT 	(64 - F)
-#define PGDIR_SHIFT_RIGHT	(64 - (F - D) - 2)
-#define LOW_MASK_BITS		0x3
-
-/* TLB1 ** ICACHE line 1: tl1 DTLB and quick VPTE miss	*/
-	ldxa		[%g1 + %g1] ASI_DMMU, %g4	! Get TAG_ACCESS
-	add		%g3, %g3, %g5			! Compute VPTE base
-	cmp		%g4, %g5			! VPTE miss?
-	bgeu,pt		%xcc, 1f			! Continue here
-	 andcc		%g4, TAG_CONTEXT_BITS, %g5	! tl0 miss Nucleus test
-	ba,a,pt		%xcc, from_tl1_trap		! Fall to tl0 miss
-1:	sllx		%g6, VPTE_SHIFT, %g4		! Position TAG_ACCESS
-	or		%g4, %g5, %g4			! Prepare TAG_ACCESS
-
-/* TLB1 ** ICACHE line 2: Quick VPTE miss	  	*/
-	mov		TSB_REG, %g1			! Grab TSB reg
-	ldxa		[%g1] ASI_DMMU, %g5		! Doing PGD caching?
-	sllx		%g6, PMD_SHIFT_LEFT, %g1	! Position PMD offset
-	be,pn		%xcc, sparc64_vpte_nucleus	! Is it from Nucleus?
-	 srlx		%g1, PMD_SHIFT_RIGHT, %g1	! Mask PMD offset bits
-	brnz,pt		%g5, sparc64_vpte_continue	! Yep, go like smoke
-	 andn		%g1, LOW_MASK_BITS, %g1		! Final PMD mask
-	sllx		%g6, PGDIR_SHIFT_LEFT, %g5	! Position PGD offset
-
-/* TLB1 ** ICACHE line 3: Quick VPTE miss	  	*/
-	srlx		%g5, PGDIR_SHIFT_RIGHT, %g5	! Mask PGD offset bits
-	andn		%g5, LOW_MASK_BITS, %g5		! Final PGD mask
-	lduwa		[%g7 + %g5] ASI_PHYS_USE_EC, %g5! Load PGD
-	brz,pn		%g5, vpte_noent			! Valid?
-sparc64_kpte_continue:
-	 sllx		%g5, 11, %g5			! Shift into place
-sparc64_vpte_continue:
-	lduwa		[%g5 + %g1] ASI_PHYS_USE_EC, %g5! Load PMD
-	sllx		%g5, 11, %g5			! Shift into place
-	brz,pn		%g5, vpte_noent			! Valid?
-
-/* TLB1 ** ICACHE line 4: Quick VPTE miss	  	*/
-	 mov		(VALID_SZ_BITS >> 61), %g1	! upper vpte into %g1
-	sllx		%g1, 61, %g1			! finish calc
-	or		%g5, VPTE_BITS, %g5		! Prepare VPTE data
-	or		%g5, %g1, %g5			! ...
-	mov		TLB_SFSR, %g1			! Restore %g1 value
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Load VPTE into TLB
-	stxa		%g4, [%g1 + %g1] ASI_DMMU	! Restore previous TAG_ACCESS
-	retry						! Load PTE once again
-
-#undef VALID_SZ_BITS
-#undef VPTE_SHIFT
-#undef VPTE_BITS
-#undef A
-#undef B
-#undef C
-#undef D
-#undef E
-#undef F
-#undef PMD_SHIFT_LEFT
-#undef PMD_SHIFT_RIGHT
-#undef PGDIR_SHIFT_LEFT
-#undef PGDIR_SHIFT_RIGHT
-#undef LOW_MASK_BITS
-
diff --git a/arch/sparc64/kernel/dtlb_base.S b/arch/sparc64/kernel/dtlb_base.S
deleted file mode 100644
index 6528786840c0..000000000000
--- a/arch/sparc64/kernel/dtlb_base.S
+++ /dev/null
@@ -1,109 +0,0 @@
-/* $Id: dtlb_base.S,v 1.17 2001/10/11 22:33:52 davem Exp $
- * dtlb_base.S:	Front end to DTLB miss replacement strategy.
- *              This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#include <asm/pgtable.h>
-#include <asm/mmu.h>
-
-/* %g1	TLB_SFSR	(%g1 + %g1 == TLB_TAG_ACCESS)
- * %g2	(KERN_HIGHBITS | KERN_LOWBITS)
- * %g3  VPTE base	(0xfffffffe00000000)	Spitfire/Blackbird (44-bit VA space)
- *			(0xffe0000000000000)	Cheetah		   (64-bit VA space)
- * %g7	__pa(current->mm->pgd)
- *
- * The VPTE base value is completely magic, but note that
- * few places in the kernel other than these TLB miss
- * handlers know anything about the VPTE mechanism or
- * how it works (see VPTE_SIZE, TASK_SIZE and PTRS_PER_PGD).
- * Consider the 44-bit VADDR Ultra-I/II case as an example:
- *
- * VA[0 :  (1<<43)] produce VPTE index [%g3                        :   0]
- * VA[0 : -(1<<43)] produce VPTE index [%g3-(1<<(43-PAGE_SHIFT+3)) : %g3]
- *
- * For Cheetah's 64-bit VADDR space this is:
- *
- * VA[0 :  (1<<63)] produce VPTE index [%g3                        :   0]
- * VA[0 : -(1<<63)] produce VPTE index [%g3-(1<<(63-PAGE_SHIFT+3)) : %g3]
- *
- * If you're paying attention you'll notice that this means half of
- * the VPTE table is above %g3 and half is below, low VA addresses
- * map progressively upwards from %g3, and high VA addresses map
- * progressively upwards towards %g3.  This trick was needed to make
- * the same 8 instruction handler work both for Spitfire/Blackbird's
- * peculiar VA space hole configuration and the full 64-bit VA space
- * one of Cheetah at the same time.
- */
-
-/* Ways we can get here:
- *
- * 1) Nucleus loads and stores to/from PA-->VA direct mappings.
- * 2) Nucleus loads and stores to/from vmalloc() areas.
- * 3) User loads and stores.
- * 4) User space accesses by nucleus at tl0
- */
-
-#if PAGE_SHIFT == 13
-/*
- * To compute vpte offset, we need to do ((addr >> 13) << 3),
- * which can be optimized to (addr >> 10) if bits 10/11/12 can
- * be guaranteed to be 0 ... mmu_context.h does guarantee this
- * by only using 10 bits in the hwcontext value.
- */
-#define CREATE_VPTE_OFFSET1(r1, r2) nop
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				srax	r1, 10, r2
-#else
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, PAGE_SHIFT, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				sllx	r2, 3, r2
-#endif
-
-/* DTLB ** ICACHE line 1: Quick user TLB misses		*/
-	mov		TLB_SFSR, %g1
-	ldxa		[%g1 + %g1] ASI_DMMU, %g4	! Get TAG_ACCESS
-	andcc		%g4, TAG_CONTEXT_BITS, %g0	! From Nucleus?
-from_tl1_trap:
-	rdpr		%tl, %g5			! For TL==3 test
-	CREATE_VPTE_OFFSET1(%g4, %g6)			! Create VPTE offset
-	be,pn		%xcc, kvmap			! Yep, special processing
-	 CREATE_VPTE_OFFSET2(%g4, %g6)			! Create VPTE offset
-	cmp		%g5, 4				! Last trap level?
-
-/* DTLB ** ICACHE line 2: User finish + quick kernel TLB misses	*/
-	be,pn		%xcc, longpath			! Yep, cannot risk VPTE miss
-	 nop						! delay slot
-	ldxa		[%g3 + %g6] ASI_S, %g5		! Load VPTE
-1:	brgez,pn	%g5, longpath			! Invalid, branch out
-	 nop						! Delay-slot
-9:	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
-	retry						! Trap return
-	nop
-
-/* DTLB ** ICACHE line 3: winfixups+real_faults		*/
-longpath:
-	rdpr		%pstate, %g5			! Move into alternate globals
-	wrpr		%g5, PSTATE_AG|PSTATE_MG, %pstate
-	rdpr		%tl, %g4			! See where we came from.
-	cmp		%g4, 1				! Is etrap/rtrap window fault?
-	mov		TLB_TAG_ACCESS, %g4		! Prepare for fault processing
-	ldxa		[%g4] ASI_DMMU, %g5		! Load faulting VA page
-	be,pt		%xcc, sparc64_realfault_common	! Jump to normal fault handling
-	 mov		FAULT_CODE_DTLB, %g4		! It was read from DTLB
-
-/* DTLB ** ICACHE line 4: Unused...	*/
-	ba,a,pt		%xcc, winfix_trampoline		! Call window fixup code
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-
-#undef CREATE_VPTE_OFFSET1
-#undef CREATE_VPTE_OFFSET2
diff --git a/arch/sparc64/kernel/dtlb_miss.S b/arch/sparc64/kernel/dtlb_miss.S
new file mode 100644
index 000000000000..09a6a15a7105
--- /dev/null
+++ b/arch/sparc64/kernel/dtlb_miss.S
@@ -0,0 +1,39 @@
+/* DTLB ** ICACHE line 1: Context 0 check and TSB load	*/
+	ldxa	[%g0] ASI_DMMU_TSB_8KB_PTR, %g1	! Get TSB 8K pointer
+	ldxa	[%g0] ASI_DMMU, %g6		! Get TAG TARGET
+	srlx	%g6, 48, %g5			! Get context
+	sllx	%g6, 22, %g6			! Zero out context
+	brz,pn	%g5, kvmap_dtlb			! Context 0 processing
+	 srlx	%g6, 22, %g6			! Delay slot
+	TSB_LOAD_QUAD(%g1, %g4)			! Load TSB entry
+	cmp	%g4, %g6			! Compare TAG
+
+/* DTLB ** ICACHE line 2: TSB compare and TLB load	*/
+	bne,pn	%xcc, tsb_miss_dtlb		! Miss
+	 mov	FAULT_CODE_DTLB, %g3
+	stxa	%g5, [%g0] ASI_DTLB_DATA_IN	! Load TLB
+	retry					! Trap done
+	nop
+	nop
+	nop
+	nop
+
+/* DTLB ** ICACHE line 3:				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
+/* DTLB ** ICACHE line 4: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
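
[Editor's note: the new dtlb_miss.S above replaces the old VPTE walk with a TSB probe: read the hardware-computed TSB pointer and tag target, branch to kvmap_dtlb for context-0 (kernel) accesses, load the TSB entry, then either reload the TLB on a tag match or fall into tsb_miss_dtlb. A minimal C sketch of that flow follows; the struct layout and helper prototypes are assumptions standing in for the assembly labels, not the kernel's actual definitions.]

/* Illustrative C sketch of the DTLB miss fast path in dtlb_miss.S. */
struct tsb_entry {
	unsigned long tag;	/* loaded with TSB_LOAD_QUAD together with data */
	unsigned long data;	/* PTE fed into ASI_DTLB_DATA_IN on a hit */
};

#define FAULT_CODE_DTLB	1			/* placeholder value for this sketch */

extern void kvmap_dtlb(void);			/* context-0 (kernel) mapping path */
extern void tsb_miss_dtlb(int fault_code);	/* slow path: page-table walk */
extern void dtlb_load_and_retry(unsigned long pte);

static void dtlb_miss_sketch(const struct tsb_entry *entry, unsigned long tag_target)
{
	unsigned long context = tag_target >> 48;	/* srlx %g6, 48, %g5 */
	unsigned long vtag = (tag_target << 22) >> 22;	/* clear the context bits */

	if (context == 0)
		kvmap_dtlb();
	else if (entry->tag != vtag)
		tsb_miss_dtlb(FAULT_CODE_DTLB);
	else
		dtlb_load_and_retry(entry->data);	/* stxa ... ASI_DTLB_DATA_IN; retry */
}
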
diff --git a/arch/sparc64/kernel/ebus.c b/arch/sparc64/kernel/ebus.c
index 7991e919d8ab..c69504aa638f 100644
--- a/arch/sparc64/kernel/ebus.c
+++ b/arch/sparc64/kernel/ebus.c
@@ -277,10 +277,9 @@ static inline void *ebus_alloc(size_t size)
 {
 	void *mem;
 
-	mem = kmalloc(size, GFP_ATOMIC);
+	mem = kzalloc(size, GFP_ATOMIC);
 	if (!mem)
 		panic("ebus_alloc: out of memory");
-	memset((char *)mem, 0, size);
 	return mem;
 }
 
diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
index a73553ae7e53..6d0b3ed77a02 100644
--- a/arch/sparc64/kernel/entry.S
+++ b/arch/sparc64/kernel/entry.S
@@ -50,7 +50,8 @@ do_fpdis:
 	add		%g0, %g0, %g0
 	ba,a,pt		%xcc, rtrap_clr_l6
 
-1:	ldub		[%g6 + TI_FPSAVED], %g5
+1:	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	ldub		[%g6 + TI_FPSAVED], %g5
 	wr		%g0, FPRS_FEF, %fprs
 	andcc		%g5, FPRS_FEF, %g0
 	be,a,pt		%icc, 1f
@@ -96,10 +97,22 @@ do_fpdis:
 	add		%g6, TI_FPREGS + 0x80, %g1
 	faddd		%f0, %f2, %f4
 	fmuld		%f0, %f2, %f6
-	ldxa		[%g3] ASI_DMMU, %g5
+
+661:	ldxa		[%g3] ASI_DMMU, %g5
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	ldxa		[%g3] ASI_MMU, %g5
+	.previous
+
 	sethi		%hi(sparc64_kern_sec_context), %g2
 	ldx		[%g2 + %lo(sparc64_kern_sec_context)], %g2
-	stxa		%g2, [%g3] ASI_DMMU
+
+661:	stxa		%g2, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 	add		%g6, TI_FPREGS + 0xc0, %g2
 	faddd		%f0, %f2, %f8
@@ -125,11 +138,23 @@ do_fpdis:
 	 fzero		%f32
 	mov		SECONDARY_CONTEXT, %g3
 	fzero		%f34
-	ldxa		[%g3] ASI_DMMU, %g5
+
+661:	ldxa		[%g3] ASI_DMMU, %g5
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	ldxa		[%g3] ASI_MMU, %g5
+	.previous
+
 	add		%g6, TI_FPREGS, %g1
 	sethi		%hi(sparc64_kern_sec_context), %g2
 	ldx		[%g2 + %lo(sparc64_kern_sec_context)], %g2
-	stxa		%g2, [%g3] ASI_DMMU
+
+661:	stxa		%g2, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 	add		%g6, TI_FPREGS + 0x40, %g2
 	faddd		%f32, %f34, %f36
@@ -154,10 +179,22 @@ do_fpdis:
 	 nop
 3:	mov		SECONDARY_CONTEXT, %g3
 	add		%g6, TI_FPREGS, %g1
-	ldxa		[%g3] ASI_DMMU, %g5
+
+661:	ldxa		[%g3] ASI_DMMU, %g5
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	ldxa		[%g3] ASI_MMU, %g5
+	.previous
+
 	sethi		%hi(sparc64_kern_sec_context), %g2
 	ldx		[%g2 + %lo(sparc64_kern_sec_context)], %g2
-	stxa		%g2, [%g3] ASI_DMMU
+
+661:	stxa		%g2, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 	mov		0x40, %g2
 	membar		#Sync
@@ -168,7 +205,13 @@ do_fpdis:
 	ldda		[%g1 + %g2] ASI_BLK_S, %f48
 	membar		#Sync
 fpdis_exit:
-	stxa		%g5, [%g3] ASI_DMMU
+
+661:	stxa		%g5, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g5, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 fpdis_exit2:
 	wr		%g7, 0, %gsr
@@ -189,6 +232,7 @@ fp_other_bounce:
 	.globl		do_fpother_check_fitos
 	.align		32
 do_fpother_check_fitos:
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
 	sethi		%hi(fp_other_bounce - 4), %g7
 	or		%g7, %lo(fp_other_bounce - 4), %g7
 
@@ -312,6 +356,7 @@ fitos_emul_fini:
 	.globl		do_fptrap
 	.align		32
 do_fptrap:
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
 	stx		%fsr, [%g6 + TI_XFSR]
 do_fptrap_after_fsr:
 	ldub		[%g6 + TI_FPSAVED], %g3
@@ -321,10 +366,22 @@ do_fptrap_after_fsr:
 	rd		%gsr, %g3
 	stx		%g3, [%g6 + TI_GSR]
 	mov		SECONDARY_CONTEXT, %g3
-	ldxa		[%g3] ASI_DMMU, %g5
+
+661:	ldxa		[%g3] ASI_DMMU, %g5
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	ldxa		[%g3] ASI_MMU, %g5
+	.previous
+
 	sethi		%hi(sparc64_kern_sec_context), %g2
 	ldx		[%g2 + %lo(sparc64_kern_sec_context)], %g2
-	stxa		%g2, [%g3] ASI_DMMU
+
+661:	stxa		%g2, [%g3] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g3] ASI_MMU
+	.previous
+
 	membar		#Sync
 	add		%g6, TI_FPREGS, %g2
 	andcc		%g1, FPRS_DL, %g0
@@ -339,7 +396,13 @@ do_fptrap_after_fsr:
 	stda		%f48, [%g2 + %g3] ASI_BLK_S
 5:	mov		SECONDARY_CONTEXT, %g1
 	membar		#Sync
-	stxa		%g5, [%g1] ASI_DMMU
+
+661:	stxa		%g5, [%g1] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g5, [%g1] ASI_MMU
+	.previous
+
 	membar		#Sync
 	ba,pt		%xcc, etrap
 	 wr		%g0, 0, %fprs
@@ -353,8 +416,6 @@ do_fptrap_after_fsr:
 	 *
 	 * With this method we can do most of the cross-call tlb/cache
 	 * flushing very quickly.
-	 *
-	 * Current CPU's IRQ worklist table is locked into %g6, don't touch.
 	 */
 	.text
 	.align		32
@@ -378,6 +439,8 @@ do_ivec:
 	sllx		%g2, %g4, %g2
 	sllx		%g4, 2, %g4
 
+	TRAP_LOAD_IRQ_WORK(%g6, %g1)
+
 	lduw		[%g6 + %g4], %g5	/* g5 = irq_work(cpu, pil) */
 	stw		%g5, [%g3 + 0x00]	/* bucket->irq_chain = g5 */
 	stw		%g3, [%g6 + %g4]	/* irq_work(cpu, pil) = bucket */
@@ -399,76 +462,6 @@ do_ivec_xcall:
 1:	jmpl		%g3, %g0
 	 nop
 
-	.globl		save_alternate_globals
-save_alternate_globals: /* %o0 = save_area */
-	rdpr		%pstate, %o5
-	andn		%o5, PSTATE_IE, %o1
-	wrpr		%o1, PSTATE_AG, %pstate
-	stx		%g0, [%o0 + 0x00]
-	stx		%g1, [%o0 + 0x08]
-	stx		%g2, [%o0 + 0x10]
-	stx		%g3, [%o0 + 0x18]
-	stx		%g4, [%o0 + 0x20]
-	stx		%g5, [%o0 + 0x28]
-	stx		%g6, [%o0 + 0x30]
-	stx		%g7, [%o0 + 0x38]
-	wrpr		%o1, PSTATE_IG, %pstate
-	stx		%g0, [%o0 + 0x40]
-	stx		%g1, [%o0 + 0x48]
-	stx		%g2, [%o0 + 0x50]
-	stx		%g3, [%o0 + 0x58]
-	stx		%g4, [%o0 + 0x60]
-	stx		%g5, [%o0 + 0x68]
-	stx		%g6, [%o0 + 0x70]
-	stx		%g7, [%o0 + 0x78]
-	wrpr		%o1, PSTATE_MG, %pstate
-	stx		%g0, [%o0 + 0x80]
-	stx		%g1, [%o0 + 0x88]
-	stx		%g2, [%o0 + 0x90]
-	stx		%g3, [%o0 + 0x98]
-	stx		%g4, [%o0 + 0xa0]
-	stx		%g5, [%o0 + 0xa8]
-	stx		%g6, [%o0 + 0xb0]
-	stx		%g7, [%o0 + 0xb8]
-	wrpr		%o5, 0x0, %pstate
-	retl
-	 nop
-
-	.globl		restore_alternate_globals
-restore_alternate_globals: /* %o0 = save_area */
-	rdpr		%pstate, %o5
-	andn		%o5, PSTATE_IE, %o1
-	wrpr		%o1, PSTATE_AG, %pstate
-	ldx		[%o0 + 0x00], %g0
-	ldx		[%o0 + 0x08], %g1
-	ldx		[%o0 + 0x10], %g2
-	ldx		[%o0 + 0x18], %g3
-	ldx		[%o0 + 0x20], %g4
-	ldx		[%o0 + 0x28], %g5
-	ldx		[%o0 + 0x30], %g6
-	ldx		[%o0 + 0x38], %g7
-	wrpr		%o1, PSTATE_IG, %pstate
-	ldx		[%o0 + 0x40], %g0
-	ldx		[%o0 + 0x48], %g1
-	ldx		[%o0 + 0x50], %g2
-	ldx		[%o0 + 0x58], %g3
-	ldx		[%o0 + 0x60], %g4
-	ldx		[%o0 + 0x68], %g5
-	ldx		[%o0 + 0x70], %g6
-	ldx		[%o0 + 0x78], %g7
-	wrpr		%o1, PSTATE_MG, %pstate
-	ldx		[%o0 + 0x80], %g0
-	ldx		[%o0 + 0x88], %g1
-	ldx		[%o0 + 0x90], %g2
-	ldx		[%o0 + 0x98], %g3
-	ldx		[%o0 + 0xa0], %g4
-	ldx		[%o0 + 0xa8], %g5
-	ldx		[%o0 + 0xb0], %g6
-	ldx		[%o0 + 0xb8], %g7
-	wrpr		%o5, 0x0, %pstate
-	retl
-	 nop
-
 	.globl		getcc, setcc
 getcc:
 	ldx		[%o0 + PT_V9_TSTATE], %o1
@@ -488,9 +481,24 @@ setcc:
 	retl
 	 stx		%o1, [%o0 + PT_V9_TSTATE]
 
-	.globl		utrap, utrap_ill
-utrap:	brz,pn		%g1, etrap
+	.globl		utrap_trap
+utrap_trap:		/* %g3=handler,%g4=level */
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	ldx		[%g6 + TI_UTRAPS], %g1
+	brnz,pt		%g1, invoke_utrap
 	 nop
+
+	ba,pt		%xcc, etrap
+	 rd		%pc, %g7
+	mov		%l4, %o1
+        call		bad_trap
+	 add		%sp, PTREGS_OFF, %o0
+	ba,pt		%xcc, rtrap
+	 clr		%l6
+
+invoke_utrap:
+	sllx		%g3, 3, %g3
+	ldx		[%g1 + %g3], %g1
 	save		%sp, -128, %sp
 	rdpr		%tstate, %l6
 	rdpr		%cwp, %l7
@@ -500,17 +508,6 @@ utrap:	brz,pn		%g1, etrap
 	rdpr		%tnpc, %l7
 	wrpr		%g1, 0, %tnpc
 	done
-utrap_ill:
-        call		bad_trap
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, rtrap
-	 clr		%l6
-
-	/* XXX Here is stuff we still need to write... -DaveM XXX */
-	.globl		netbsd_syscall
-netbsd_syscall:
-	retl
-	 nop
 
 	/* We need to carefully read the error status, ACK
 	 * the errors, prevent recursive traps, and pass the
@@ -1001,7 +998,7 @@ dcpe_icpe_tl1_common:
 	 * %g3:		scratch
 	 * %g4:		AFSR
 	 * %g5:		AFAR
-	 * %g6:		current thread ptr
+	 * %g6:		unused, will have current thread ptr after etrap
 	 * %g7:		scratch
 	 */
 __cheetah_log_error:
@@ -1539,13 +1536,14 @@ ret_from_syscall:
 
 1:		b,pt		%xcc, ret_sys_call
 		 ldx		[%sp + PTREGS_OFF + PT_V9_I0], %o0
-sparc_exit:	wrpr		%g0, (PSTATE_RMO | PSTATE_PEF | PSTATE_PRIV), %pstate
+sparc_exit:	rdpr		%pstate, %g2
+		wrpr		%g2, PSTATE_IE, %pstate
 		rdpr		%otherwin, %g1
 		rdpr		%cansave, %g3
 		add		%g3, %g1, %g3
 		wrpr		%g3, 0x0, %cansave
 		wrpr		%g0, 0x0, %otherwin
-		wrpr		%g0, (PSTATE_RMO | PSTATE_PEF | PSTATE_PRIV | PSTATE_IE), %pstate
+		wrpr		%g2, 0x0, %pstate
 		ba,pt		%xcc, sys_exit
 		 stb		%g0, [%g6 + TI_WSAVED]
 
@@ -1690,3 +1688,138 @@ __flushw_user:
 	 restore	%g0, %g0, %g0
 2:	retl
 	 nop
+
+#ifdef CONFIG_SMP
+	.globl		hard_smp_processor_id
+hard_smp_processor_id:
+#endif
+	.globl		real_hard_smp_processor_id
+real_hard_smp_processor_id:
+	__GET_CPUID(%o0)
+	retl
+	 nop
+
+	/* %o0: devhandle
+	 * %o1:	devino
+	 *
+	 * returns %o0: sysino
+	 */
+	.globl	sun4v_devino_to_sysino
+sun4v_devino_to_sysino:
+	mov	HV_FAST_INTR_DEVINO2SYSINO, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: sysino
+	 *
+	 * returns %o0: intr_enabled (HV_INTR_{DISABLED,ENABLED})
+	 */
+	.globl	sun4v_intr_getenabled
+sun4v_intr_getenabled:
+	mov	HV_FAST_INTR_GETENABLED, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: sysino
+	 * %o1: intr_enabled (HV_INTR_{DISABLED,ENABLED})
+	 */
+	.globl	sun4v_intr_setenabled
+sun4v_intr_setenabled:
+	mov	HV_FAST_INTR_SETENABLED, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0: sysino
+	 *
+	 * returns %o0: intr_state (HV_INTR_STATE_*)
+	 */
+	.globl	sun4v_intr_getstate
+sun4v_intr_getstate:
+	mov	HV_FAST_INTR_GETSTATE, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: sysino
+	 * %o1: intr_state (HV_INTR_STATE_*)
+	 */
+	.globl	sun4v_intr_setstate
+sun4v_intr_setstate:
+	mov	HV_FAST_INTR_SETSTATE, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0: sysino
+	 *
+	 * returns %o0: cpuid
+	 */
+	.globl	sun4v_intr_gettarget
+sun4v_intr_gettarget:
+	mov	HV_FAST_INTR_GETTARGET, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: sysino
+	 * %o1: cpuid
+	 */
+	.globl	sun4v_intr_settarget
+sun4v_intr_settarget:
+	mov	HV_FAST_INTR_SETTARGET, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0:	type
+	 * %o1:	queue paddr
+	 * %o2:	num queue entries
+	 *
+	 * returns %o0:	status
+	 */
+	.globl	sun4v_cpu_qconf
+sun4v_cpu_qconf:
+	mov	HV_FAST_CPU_QCONF, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* returns %o0:	status
+	 */
+	.globl	sun4v_cpu_yield
+sun4v_cpu_yield:
+	mov	HV_FAST_CPU_YIELD, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0:	num cpus in cpu list
+	 * %o1:	cpu list paddr
+	 * %o2:	mondo block paddr
+	 *
+	 * returns %o0: status
+	 */
+	.globl	sun4v_cpu_mondo_send
+sun4v_cpu_mondo_send:
+	mov	HV_FAST_CPU_MONDO_SEND, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 nop
+
+	/* %o0:	CPU ID
+	 *
+	 * returns %o0:	-status if status non-zero, else
+	 *         %o0:	cpu state as HV_CPU_STATE_*
+	 */
+	.globl	sun4v_cpu_state
+sun4v_cpu_state:
+	mov	HV_FAST_CPU_STATE, %o5
+	ta	HV_FAST_TRAP
+	brnz,pn	%o0, 1f
+	 sub	%g0, %o0, %o0
+	mov	%o1, %o0
+1:	retl
+	 nop
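
[Editor's note: the block of sun4v_* stubs added to entry.S above all follow the same convention: arguments arrive in %o0..%o2, the fast-trap function number is placed in %o5, and "ta HV_FAST_TRAP" enters the hypervisor, which returns its status or value in %o0/%o1. Seen from C they behave like ordinary functions; the prototypes below are an illustrative reading of the register comments in the assembly, not necessarily the declarations the kernel headers use.]

/* Hypothetical C-level view of the sun4v hypervisor wrappers above. */
unsigned long sun4v_devino_to_sysino(unsigned long devhandle, unsigned long devino);
unsigned long sun4v_intr_getenabled(unsigned long sysino);
void          sun4v_intr_setenabled(unsigned long sysino, unsigned long intr_enabled);
unsigned long sun4v_intr_getstate(unsigned long sysino);
void          sun4v_intr_setstate(unsigned long sysino, unsigned long intr_state);
unsigned long sun4v_intr_gettarget(unsigned long sysino);
void          sun4v_intr_settarget(unsigned long sysino, unsigned long cpuid);
unsigned long sun4v_cpu_qconf(unsigned long type, unsigned long queue_paddr,
			      unsigned long num_entries);
unsigned long sun4v_cpu_yield(void);
unsigned long sun4v_cpu_mondo_send(unsigned long cpu_count, unsigned long cpu_list_paddr,
				   unsigned long mondo_block_paddr);
long          sun4v_cpu_state(unsigned long cpuid);	/* HV_CPU_STATE_*, or -status on error */
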
diff --git a/arch/sparc64/kernel/etrap.S b/arch/sparc64/kernel/etrap.S
index 0d8eba21111b..149383835c25 100644
--- a/arch/sparc64/kernel/etrap.S
+++ b/arch/sparc64/kernel/etrap.S
@@ -31,6 +31,7 @@
 		.globl	etrap, etrap_irq, etraptl1
 etrap:		rdpr	%pil, %g2
 etrap_irq:
+		TRAP_LOAD_THREAD_REG(%g6, %g1)
 		rdpr	%tstate, %g1
 		sllx	%g2, 20, %g3
 		andcc	%g1, TSTATE_PRIV, %g0
@@ -54,7 +55,31 @@ etrap_irq:
 		rd	%y, %g3
 		stx	%g1, [%g2 + STACKFRAME_SZ + PT_V9_TNPC]
 		st	%g3, [%g2 + STACKFRAME_SZ + PT_V9_Y]
-		save	%g2, -STACK_BIAS, %sp	! Ordering here is critical
+
+		rdpr	%cansave, %g1
+		brnz,pt %g1, etrap_save
+		 nop
+
+		rdpr	%cwp, %g1
+		add	%g1, 2, %g1
+		wrpr	%g1, %cwp
+		be,pt	%xcc, etrap_user_spill
+		 mov	ASI_AIUP, %g3
+
+		rdpr	%otherwin, %g3
+		brz	%g3, etrap_kernel_spill
+		 mov	ASI_AIUS, %g3
+
+etrap_user_spill:
+
+		wr	%g3, 0x0, %asi
+		ldx	[%g6 + TI_FLAGS], %g3
+		and	%g3, _TIF_32BIT, %g3
+		brnz,pt	%g3, etrap_user_spill_32bit
+		 nop
+		ba,a,pt	%xcc, etrap_user_spill_64bit
+
+etrap_save:	save	%g2, -STACK_BIAS, %sp
 		mov	%g6, %l6
 
 		bne,pn	%xcc, 3f
@@ -70,42 +95,56 @@ etrap_irq:
 		wrpr	%g2, 0, %wstate
 		sethi	%hi(sparc64_kern_pri_context), %g2
 		ldx	[%g2 + %lo(sparc64_kern_pri_context)], %g3
-		stxa	%g3, [%l4] ASI_DMMU
-		flush	%l6
-		wr	%g0, ASI_AIUS, %asi
-2:		wrpr	%g0, 0x0, %tl
-		mov	%g4, %l4
+
+661:		stxa	%g3, [%l4] ASI_DMMU
+		.section .sun4v_1insn_patch, "ax"
+		.word	661b
+		stxa	%g3, [%l4] ASI_MMU
+		.previous
+
+		sethi	%hi(KERNBASE), %l4
+		flush	%l4
+		mov	ASI_AIUS, %l7
+2:		mov	%g4, %l4
 		mov	%g5, %l5
+		add	%g7, 4, %l2
+
+		/* Go to trap time globals so we can save them.  */
+661:		wrpr	%g0, ETRAP_PSTATE1, %pstate
+		.section .sun4v_1insn_patch, "ax"
+		.word	661b
+		SET_GL(0)
+		.previous
 
-		mov	%g7, %l2
-		wrpr	%g0, ETRAP_PSTATE1, %pstate
 		stx	%g1, [%sp + PTREGS_OFF + PT_V9_G1]
 		stx	%g2, [%sp + PTREGS_OFF + PT_V9_G2]
+		sllx	%l7, 24, %l7
 		stx	%g3, [%sp + PTREGS_OFF + PT_V9_G3]
+		rdpr	%cwp, %l0
 		stx	%g4, [%sp + PTREGS_OFF + PT_V9_G4]
 		stx	%g5, [%sp + PTREGS_OFF + PT_V9_G5]
 		stx	%g6, [%sp + PTREGS_OFF + PT_V9_G6]
-
 		stx	%g7, [%sp + PTREGS_OFF + PT_V9_G7]
+		or	%l7, %l0, %l7
+		sethi	%hi(TSTATE_RMO | TSTATE_PEF), %l0
+		or	%l7, %l0, %l7
+		wrpr	%l2, %tnpc
+		wrpr	%l7, (TSTATE_PRIV | TSTATE_IE), %tstate
 		stx	%i0, [%sp + PTREGS_OFF + PT_V9_I0]
 		stx	%i1, [%sp + PTREGS_OFF + PT_V9_I1]
 		stx	%i2, [%sp + PTREGS_OFF + PT_V9_I2]
 		stx	%i3, [%sp + PTREGS_OFF + PT_V9_I3]
 		stx	%i4, [%sp + PTREGS_OFF + PT_V9_I4]
 		stx	%i5, [%sp + PTREGS_OFF + PT_V9_I5]
-
 		stx	%i6, [%sp + PTREGS_OFF + PT_V9_I6]
-		stx	%i7, [%sp + PTREGS_OFF + PT_V9_I7]
-		wrpr	%g0, ETRAP_PSTATE2, %pstate
 		mov	%l6, %g6
-#ifdef CONFIG_SMP
-		mov	TSB_REG, %g3
-		ldxa	[%g3] ASI_IMMU, %g5
-#endif
-		jmpl	%l2 + 0x4, %g0
-		 ldx	[%g6 + TI_TASK], %g4
+		stx	%i7, [%sp + PTREGS_OFF + PT_V9_I7]
+		LOAD_PER_CPU_BASE(%g5, %g6, %g4, %g3, %l1)
+		ldx	[%g6 + TI_TASK], %g4
+		done
 
-3:		ldub	[%l6 + TI_FPDEPTH], %l5
+3:		mov	ASI_P, %l7
+		ldub	[%l6 + TI_FPDEPTH], %l5
 		add	%l6, TI_FPSAVED + 1, %l4
 		srl	%l5, 1, %l3
 		add	%l5, 2, %l5
@@ -125,6 +164,7 @@ etraptl1:	/* Save tstate/tpc/tnpc of TL 1-->4 and the tl register itself.
 		 *	0x58	TL4's TT
 		 *	0x60	TL
 		 */
+		TRAP_LOAD_THREAD_REG(%g6, %g1)
 		sub	%sp, ((4 * 8) * 4) + 8, %g2
 		rdpr	%tl, %g1
 
@@ -148,6 +188,11 @@ etraptl1:	/* Save tstate/tpc/tnpc of TL 1-->4 and the tl register itself.
 		rdpr	%tt, %g3
 		stx	%g3, [%g2 + STACK_BIAS + 0x38]
 
+		sethi	%hi(is_sun4v), %g3
+		lduw	[%g3 + %lo(is_sun4v)], %g3
+		brnz,pn	%g3, finish_tl1_capture
+		 nop
+
 		wrpr	%g0, 3, %tl
 		rdpr	%tstate, %g3
 		stx	%g3, [%g2 + STACK_BIAS + 0x40]
@@ -168,91 +213,20 @@ etraptl1:	/* Save tstate/tpc/tnpc of TL 1-->4 and the tl register itself.
 		rdpr	%tt, %g3
 		stx	%g3, [%g2 + STACK_BIAS + 0x78]
 
-		wrpr	%g1, %tl
 		stx	%g1, [%g2 + STACK_BIAS + 0x80]
 
+finish_tl1_capture:
+		wrpr	%g0, 1, %tl
+661:		nop
+		.section .sun4v_1insn_patch, "ax"
+		.word	661b
+		SET_GL(1)
+		.previous
+
 		rdpr	%tstate, %g1
 		sub	%g2, STACKFRAME_SZ + TRACEREG_SZ - STACK_BIAS, %g2
 		ba,pt	%xcc, 1b
 		 andcc	%g1, TSTATE_PRIV, %g0
 
-		.align	64
-		.globl	scetrap
-scetrap:	rdpr	%pil, %g2
-		rdpr	%tstate, %g1
-		sllx	%g2, 20, %g3
-		andcc	%g1, TSTATE_PRIV, %g0
-		or	%g1, %g3, %g1
-		bne,pn	%xcc, 1f
-		 sub	%sp, (STACKFRAME_SZ+TRACEREG_SZ-STACK_BIAS), %g2
-		wrpr	%g0, 7, %cleanwin
-
-		sllx	%g1, 51, %g3
-		sethi	%hi(TASK_REGOFF), %g2
-		or	%g2, %lo(TASK_REGOFF), %g2
-		brlz,pn	%g3, 1f
-		 add	%g6, %g2, %g2
-		wr	%g0, 0, %fprs
-1:		rdpr	%tpc, %g3
-		stx	%g1, [%g2 + STACKFRAME_SZ + PT_V9_TSTATE]
-
-		rdpr	%tnpc, %g1
-		stx	%g3, [%g2 + STACKFRAME_SZ + PT_V9_TPC]
-		stx	%g1, [%g2 + STACKFRAME_SZ + PT_V9_TNPC]
-		save	%g2, -STACK_BIAS, %sp	! Ordering here is critical
-		mov	%g6, %l6
-		bne,pn	%xcc, 2f
-		 mov	ASI_P, %l7
-		rdpr	%canrestore, %g3
-
-		rdpr	%wstate, %g2
-		wrpr	%g0, 0, %canrestore
-		sll	%g2, 3, %g2
-		mov	PRIMARY_CONTEXT, %l4
-		wrpr	%g3, 0, %otherwin
-		wrpr	%g2, 0, %wstate
-		sethi	%hi(sparc64_kern_pri_context), %g2
-		ldx	[%g2 + %lo(sparc64_kern_pri_context)], %g3
-		stxa	%g3, [%l4] ASI_DMMU
-		flush	%l6
-
-		mov	ASI_AIUS, %l7
-2:		mov	%g4, %l4
-		mov	%g5, %l5
-		add	%g7, 0x4, %l2
-		wrpr	%g0, ETRAP_PSTATE1, %pstate
-		stx	%g1, [%sp + PTREGS_OFF + PT_V9_G1]
-		stx	%g2, [%sp + PTREGS_OFF + PT_V9_G2]
-		sllx	%l7, 24, %l7
-
-		stx	%g3, [%sp + PTREGS_OFF + PT_V9_G3]
-		rdpr	%cwp, %l0
-		stx	%g4, [%sp + PTREGS_OFF + PT_V9_G4]
-		stx	%g5, [%sp + PTREGS_OFF + PT_V9_G5]
-		stx	%g6, [%sp + PTREGS_OFF + PT_V9_G6]
-		stx	%g7, [%sp + PTREGS_OFF + PT_V9_G7]
-		or	%l7, %l0, %l7
-		sethi	%hi(TSTATE_RMO | TSTATE_PEF), %l0
-
-		or	%l7, %l0, %l7
-		wrpr	%l2, %tnpc
-		wrpr	%l7, (TSTATE_PRIV | TSTATE_IE), %tstate
-		stx	%i0, [%sp + PTREGS_OFF + PT_V9_I0]
-		stx	%i1, [%sp + PTREGS_OFF + PT_V9_I1]
-		stx	%i2, [%sp + PTREGS_OFF + PT_V9_I2]
-		stx	%i3, [%sp + PTREGS_OFF + PT_V9_I3]
-		stx	%i4, [%sp + PTREGS_OFF + PT_V9_I4]
-
-		stx	%i5, [%sp + PTREGS_OFF + PT_V9_I5]
-		stx	%i6, [%sp + PTREGS_OFF + PT_V9_I6]
-		mov	%l6, %g6
-		stx	%i7, [%sp + PTREGS_OFF + PT_V9_I7]
-#ifdef CONFIG_SMP
-		mov	TSB_REG, %g3
-		ldxa	[%g3] ASI_IMMU, %g5
-#endif
-		ldx	[%g6 + TI_TASK], %g4
-		done
-
 #undef TASK_REGOFF
 #undef ETRAP_PSTATE1
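
[Editor's note: etrap.S, like entry.S before it, now wraps every ASI_DMMU access in the "661:" / .sun4v_1insn_patch idiom: the sun4u instruction is assembled in place, and the patch section records its address (".word 661b") followed by the one-instruction sun4v replacement. At boot, when running on a hypervisor, a patcher overwrites the in-place instruction. A minimal C sketch of such a patcher follows, assuming the two-word entry layout the assembly emits and that kernel text addresses fit in 32 bits; the struct, symbol, and helper names are assumptions.]

/* Minimal sketch of applying .sun4v_1insn_patch entries. */
struct sun4v_1insn_patch_entry {
	unsigned int addr;	/* address of the instruction to patch (.word 661b) */
	unsigned int insn;	/* sun4v replacement instruction */
};

extern struct sun4v_1insn_patch_entry __sun4v_1insn_patch_start[];
extern struct sun4v_1insn_patch_entry __sun4v_1insn_patch_end[];
extern void flush_patched_insn(void *addr);	/* hypothetical I-cache flush helper */

static void apply_sun4v_1insn_patches(void)
{
	struct sun4v_1insn_patch_entry *p;

	for (p = __sun4v_1insn_patch_start; p < __sun4v_1insn_patch_end; p++) {
		unsigned int *insn_ptr = (unsigned int *)(unsigned long)p->addr;

		*insn_ptr = p->insn;		/* overwrite the sun4u instruction */
		flush_patched_insn(insn_ptr);	/* keep the instruction cache coherent */
	}
}
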
diff --git a/arch/sparc64/kernel/head.S b/arch/sparc64/kernel/head.S
index b49dcd4504b0..3eadac5e171e 100644
--- a/arch/sparc64/kernel/head.S
+++ b/arch/sparc64/kernel/head.S
@@ -26,6 +26,7 @@
 #include <asm/head.h>
 #include <asm/ttable.h>
 #include <asm/mmu.h>
+#include <asm/cpudata.h>
 	
 /* This section from from _start to sparc64_boot_end should fit into
  * 0x0000000000404000 to 0x0000000000408000.
@@ -94,12 +95,17 @@ sparc64_boot:
 	wrpr	%g1, 0x0, %pstate
 	ba,a,pt	%xcc, 1f
 
-	.globl	prom_finddev_name, prom_chosen_path
-	.globl	prom_getprop_name, prom_mmu_name
-	.globl	prom_callmethod_name, prom_translate_name
+	.globl	prom_finddev_name, prom_chosen_path, prom_root_node
+	.globl	prom_getprop_name, prom_mmu_name, prom_peer_name
+	.globl	prom_callmethod_name, prom_translate_name, prom_root_compatible
 	.globl	prom_map_name, prom_unmap_name, prom_mmu_ihandle_cache
 	.globl	prom_boot_mapped_pc, prom_boot_mapping_mode
 	.globl	prom_boot_mapping_phys_high, prom_boot_mapping_phys_low
+	.globl	is_sun4v
+prom_peer_name:
+	.asciz	"peer"
+prom_compatible_name:
+	.asciz	"compatible"
 prom_finddev_name:
 	.asciz	"finddevice"
 prom_chosen_path:
@@ -116,7 +122,13 @@ prom_map_name:
 	.asciz	"map"
 prom_unmap_name:
 	.asciz	"unmap"
+prom_sun4v_name:
+	.asciz	"sun4v"
 	.align	4
+prom_root_compatible:
+	.skip	64
+prom_root_node:
+	.word	0
 prom_mmu_ihandle_cache:
 	.word	0
 prom_boot_mapped_pc:
@@ -128,8 +140,54 @@ prom_boot_mapping_phys_high:
 	.xword	0
 prom_boot_mapping_phys_low:
 	.xword	0
+is_sun4v:
+	.word	0
 1:
 	rd	%pc, %l0
+
+	mov	(1b - prom_peer_name), %l1
+	sub	%l0, %l1, %l1
+	mov	0, %l2
+
+	/* prom_root_node = prom_peer(0) */
+	stx	%l1, [%sp + 2047 + 128 + 0x00]	! service, "peer"
+	mov	1, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x08]	! num_args, 1
+	stx	%l3, [%sp + 2047 + 128 + 0x10]	! num_rets, 1
+	stx	%l2, [%sp + 2047 + 128 + 0x18]	! arg1, 0
+	stx	%g0, [%sp + 2047 + 128 + 0x20]	! ret1
+	call	%l7
+	 add	%sp, (2047 + 128), %o0		! argument array
+
+	ldx	[%sp + 2047 + 128 + 0x20], %l4	! prom root node
+	mov	(1b - prom_root_node), %l1
+	sub	%l0, %l1, %l1
+	stw	%l4, [%l1]
+
+	mov	(1b - prom_getprop_name), %l1
+	mov	(1b - prom_compatible_name), %l2
+	mov	(1b - prom_root_compatible), %l5
+	sub	%l0, %l1, %l1
+	sub	%l0, %l2, %l2
+	sub	%l0, %l5, %l5
+
+	/* prom_getproperty(prom_root_node, "compatible",
+	 *                  &prom_root_compatible, 64)
+	 */
+	stx	%l1, [%sp + 2047 + 128 + 0x00]	! service, "getprop"
+	mov	4, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x08]	! num_args, 4
+	mov	1, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x10]	! num_rets, 1
+	stx	%l4, [%sp + 2047 + 128 + 0x18]	! arg1, prom_root_node
+	stx	%l2, [%sp + 2047 + 128 + 0x20]	! arg2, "compatible"
+	stx	%l5, [%sp + 2047 + 128 + 0x28]	! arg3, &prom_root_compatible
+	mov	64, %l3
+	stx	%l3, [%sp + 2047 + 128 + 0x30]	! arg4, size
+	stx	%g0, [%sp + 2047 + 128 + 0x38]	! ret1
+	call	%l7
+	 add	%sp, (2047 + 128), %o0		! argument array
+
 	mov	(1b - prom_finddev_name), %l1
 	mov	(1b - prom_chosen_path), %l2
 	mov	(1b - prom_boot_mapped_pc), %l3
@@ -238,6 +296,27 @@ prom_boot_mapping_phys_low:
 	add	%sp, (192 + 128), %sp
 
 sparc64_boot_after_remap:
+	sethi	%hi(prom_root_compatible), %g1
+	or	%g1, %lo(prom_root_compatible), %g1
+	sethi	%hi(prom_sun4v_name), %g7
+	or	%g7, %lo(prom_sun4v_name), %g7
+	mov	5, %g3
+1:	ldub	[%g7], %g2
+	ldub	[%g1], %g4
+	cmp	%g2, %g4
+	bne,pn	%icc, 2f
+	 add	%g7, 1, %g7
+	subcc	%g3, 1, %g3
+	bne,pt	%xcc, 1b
+	 add	%g1, 1, %g1
+
+	sethi	%hi(is_sun4v), %g1
+	or	%g1, %lo(is_sun4v), %g1
+	mov	1, %g7
+	stw	%g7, [%g1]
+
+2:
+	BRANCH_IF_SUN4V(g1, jump_to_sun4u_init)
 	BRANCH_IF_CHEETAH_BASE(g1,g7,cheetah_boot)
 	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,cheetah_plus_boot)
 	ba,pt	%xcc, spitfire_boot
@@ -301,20 +380,58 @@ jump_to_sun4u_init:
 	 nop
 
 sun4u_init:
+	BRANCH_IF_SUN4V(g1, sun4v_init)
+
 	/* Set ctx 0 */
-	mov	PRIMARY_CONTEXT, %g7
-	stxa	%g0, [%g7] ASI_DMMU
-	membar	#Sync
+	mov		PRIMARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_DMMU
+	membar		#Sync
 
-	mov	SECONDARY_CONTEXT, %g7
-	stxa	%g0, [%g7] ASI_DMMU
+	mov		SECONDARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_DMMU
 	membar	#Sync
 
-	BRANCH_IF_ANY_CHEETAH(g1,g7,cheetah_tlb_fixup)
+	ba,pt		%xcc, sun4u_continue
+	 nop
+
+sun4v_init:
+	/* Set ctx 0 */
+	mov		PRIMARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_MMU
+	membar		#Sync
+
+	mov		SECONDARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_MMU
+	membar		#Sync
+	ba,pt		%xcc, niagara_tlb_fixup
+	 nop
+
+sun4u_continue:
+	BRANCH_IF_ANY_CHEETAH(g1, g7, cheetah_tlb_fixup)
 
 	ba,pt	%xcc, spitfire_tlb_fixup
 	 nop
 
+niagara_tlb_fixup:
+	mov	3, %g2		/* Set TLB type to hypervisor. */
+	sethi	%hi(tlb_type), %g1
+	stw	%g2, [%g1 + %lo(tlb_type)]
+
+	/* Patch copy/clear ops.  */
+	call	niagara_patch_copyops
+	 nop
+	call	niagara_patch_bzero
+	 nop
+	call	niagara_patch_pageops
+	 nop
+
+	/* Patch TLB/cache ops.  */
+	call	hypervisor_patch_cachetlbops
+	 nop
+
+	ba,pt	%xcc, tlb_fixup_done
+	 nop
+
 cheetah_tlb_fixup:
 	mov	2, %g2		/* Set TLB type to cheetah+. */
 	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g7,1f)
@@ -411,85 +528,55 @@ setup_trap_table:
 	wrpr	%g0, 15, %pil
 
 	/* Make the firmware call to jump over to the Linux trap table.  */
-	call	prom_set_trap_table
-	 sethi	%hi(sparc64_ttable_tl0), %o0
+	sethi	%hi(is_sun4v), %o0
+	lduw	[%o0 + %lo(is_sun4v)], %o0
+	brz,pt	%o0, 1f
+	 nop
 
-	/* Start using proper page size encodings in ctx register.  */
-	sethi	%hi(sparc64_kern_pri_context), %g3
-	ldx	[%g3 + %lo(sparc64_kern_pri_context)], %g2
-	mov	PRIMARY_CONTEXT, %g1
-	stxa	%g2, [%g1] ASI_DMMU
-	membar	#Sync
+	TRAP_LOAD_TRAP_BLOCK(%g2, %g3)
+	add	%g2, TRAP_PER_CPU_FAULT_INFO, %g2
+	stxa	%g2, [%g0] ASI_SCRATCHPAD
 
-	/* The Linux trap handlers expect various trap global registers
-	 * to be setup with some fixed values.  So here we set these
-	 * up very carefully.  These globals are:
-	 *
-	 * Alternate Globals (PSTATE_AG):
-	 *
-	 * %g6			--> current_thread_info()
-	 *
-	 * MMU Globals (PSTATE_MG):
-	 *
-	 * %g1			--> TLB_SFSR
-	 * %g2			--> ((_PAGE_VALID | _PAGE_SZ4MB |
-	 *			      _PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-	 *			     ^ 0xfffff80000000000)
-	 * (this %g2 value is used for computing the PAGE_OFFSET kernel
-	 *  TLB entries quickly, the virtual address of the fault XOR'd
-	 *  with this %g2 value is the PTE to load into the TLB)
-	 * %g3			--> VPTE_BASE_CHEETAH or VPTE_BASE_SPITFIRE
+	/* Compute physical address:
 	 *
-	 * Interrupt Globals (PSTATE_IG, setup by init_irqwork_curcpu()):
-	 *
-	 * %g6			--> __irq_work[smp_processor_id()]
+	 * paddr = kern_base + (mmfsa_vaddr - KERNBASE)
 	 */
+	sethi	%hi(KERNBASE), %g3
+	sub	%g2, %g3, %g2
+	sethi	%hi(kern_base), %g3
+	ldx	[%g3 + %lo(kern_base)], %g3
+	add	%g2, %g3, %o1
 
-	rdpr	%pstate, %o1
-	mov	%g6, %o2
-	wrpr	%o1, PSTATE_AG, %pstate
-	mov	%o2, %g6
-
-#define KERN_HIGHBITS		((_PAGE_VALID|_PAGE_SZ4MB)^0xfffff80000000000)
-#define KERN_LOWBITS		(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-	wrpr	%o1, PSTATE_MG, %pstate
-	mov	TSB_REG, %g1
-	stxa	%g0, [%g1] ASI_DMMU
-	membar	#Sync
-	stxa	%g0, [%g1] ASI_IMMU
-	membar	#Sync
-	mov	TLB_SFSR, %g1
-	sethi	%uhi(KERN_HIGHBITS), %g2
-	or	%g2, %ulo(KERN_HIGHBITS), %g2
-	sllx	%g2, 32, %g2
-	or	%g2, KERN_LOWBITS, %g2
-
-	BRANCH_IF_ANY_CHEETAH(g3,g7,8f)
-	ba,pt	%xcc, 9f
+	call	prom_set_trap_table_sun4v
+	 sethi	%hi(sparc64_ttable_tl0), %o0
+
+	ba,pt	%xcc, 2f
 	 nop
 
-8:
-	sethi		%uhi(VPTE_BASE_CHEETAH), %g3
-	or		%g3, %ulo(VPTE_BASE_CHEETAH), %g3
-	ba,pt		%xcc, 2f
-	 sllx		%g3, 32, %g3
+1:	call	prom_set_trap_table
+	 sethi	%hi(sparc64_ttable_tl0), %o0
 
-9:
-	sethi		%uhi(VPTE_BASE_SPITFIRE), %g3
-	or		%g3, %ulo(VPTE_BASE_SPITFIRE), %g3
-	sllx		%g3, 32, %g3
+	/* Start using proper page size encodings in ctx register.  */
+2:	sethi	%hi(sparc64_kern_pri_context), %g3
+	ldx	[%g3 + %lo(sparc64_kern_pri_context)], %g2
 
-2:
-	clr	%g7
-#undef KERN_HIGHBITS
-#undef KERN_LOWBITS
+	mov		PRIMARY_CONTEXT, %g1
+
+661:	stxa		%g2, [%g1] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g1] ASI_MMU
+	.previous
+
+	membar	#Sync
 
 	/* Kill PROM timer */
 	sethi	%hi(0x80000000), %o2
 	sllx	%o2, 32, %o2
 	wr	%o2, 0, %tick_cmpr
 
-	BRANCH_IF_ANY_CHEETAH(o2,o3,1f)
+	BRANCH_IF_SUN4V(o2, 1f)
+	BRANCH_IF_ANY_CHEETAH(o2, o3, 1f)
 
 	ba,pt	%xcc, 2f
 	 nop
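The sun4v branch of setup_trap_table above hands the hypervisor the physical address of the per-cpu fault status area, and the address arithmetic it performs is the usual kernel-image translation: the image is linked at KERNBASE but loaded at the physical address kept in kern_base, so a kernel-image virtual address converts to physical by a simple offset. A one-function sketch (the KERNBASE value shown is only illustrative; the real constant comes from the sparc64 headers):

extern unsigned long kern_base;			/* physical load address of the kernel image */
#define KERNBASE	0x0000000000400000UL	/* illustrative; see the sparc64 headers */

static unsigned long kern_vaddr_to_paddr(unsigned long vaddr)
{
	/* paddr = kern_base + (vaddr - KERNBASE), as in the comment above */
	return kern_base + (vaddr - KERNBASE);
}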
@@ -502,7 +589,6 @@ setup_trap_table:
 
 2:
 	wrpr	%g0, %g0, %wstate
-	wrpr	%o1, 0x0, %pstate
 
 	call	init_irqwork_curcpu
 	 nop
@@ -517,7 +603,7 @@ setup_trap_table:
 	 restore
 
 	.globl	setup_tba
-setup_tba:	/* i0 = is_starfire */
+setup_tba:
 	save	%sp, -192, %sp
 
 	/* The boot processor is the only cpu which invokes this
@@ -536,31 +622,35 @@ setup_tba:	/* i0 = is_starfire */
 	 restore
 sparc64_boot_end:
 
-#include "systbls.S"
 #include "ktlb.S"
+#include "tsb.S"
 #include "etrap.S"
 #include "rtrap.S"
 #include "winfixup.S"
 #include "entry.S"
+#include "sun4v_tlb_miss.S"
+#include "sun4v_ivec.S"
 
 /*
  * The following skip makes sure the trap table in ttable.S is aligned
  * on a 32K boundary as required by the v9 specs for TBA register.
+ *
+ * We align to a 32K boundary, then we have the 32K kernel TSB,
+ * then the 32K aligned trap table.
  */
 1:
 	.skip	0x4000 + _start - 1b
 
-#ifdef CONFIG_SBUS
-/* This is just a hack to fool make depend config.h discovering
-   strategy: As the .S files below need config.h, but
-   make depend does not find it for them, we include config.h
-   in head.S */
-#endif
+	.globl	swapper_tsb
+swapper_tsb:
+	.skip	(32 * 1024)
 
 ! 0x0000000000408000
 
 #include "ttable.S"
 
+#include "systbls.S"
+
 	.data
 	.align	8
 	.globl	prom_tba, tlb_type
diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c
index 233526ba3abe..11e645c9ec50 100644
--- a/arch/sparc64/kernel/irq.c
+++ b/arch/sparc64/kernel/irq.c
@@ -21,6 +21,7 @@
 #include <linux/delay.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/bootmem.h>
 
 #include <asm/ptrace.h>
 #include <asm/processor.h>
@@ -39,6 +40,7 @@
 #include <asm/cache.h>
 #include <asm/cpudata.h>
 #include <asm/auxio.h>
+#include <asm/head.h>
 
 #ifdef CONFIG_SMP
 static void distribute_irqs(void);
@@ -115,9 +117,7 @@ int show_interrupts(struct seq_file *p, void *v)
 #ifndef CONFIG_SMP
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #else
-		for (j = 0; j < NR_CPUS; j++) {
-			if (!cpu_online(j))
-				continue;
+		for_each_online_cpu(j) {
 			seq_printf(p, "%10u ",
 				   kstat_cpu(j).irqs[i]);
 		}
@@ -136,12 +136,48 @@ out_unlock:
 	return 0;
 }
 
+extern unsigned long real_hard_smp_processor_id(void);
+
+static unsigned int sun4u_compute_tid(unsigned long imap, unsigned long cpuid)
+{
+	unsigned int tid;
+
+	if (this_is_starfire) {
+		tid = starfire_translate(imap, cpuid);
+		tid <<= IMAP_TID_SHIFT;
+		tid &= IMAP_TID_UPA;
+	} else {
+		if (tlb_type == cheetah || tlb_type == cheetah_plus) {
+			unsigned long ver;
+
+			__asm__ ("rdpr %%ver, %0" : "=r" (ver));
+			if ((ver >> 32UL) == __JALAPENO_ID ||
+			    (ver >> 32UL) == __SERRANO_ID) {
+				tid = cpuid << IMAP_TID_SHIFT;
+				tid &= IMAP_TID_JBUS;
+			} else {
+				unsigned int a = cpuid & 0x1f;
+				unsigned int n = (cpuid >> 5) & 0x1f;
+
+				tid = ((a << IMAP_AID_SHIFT) |
+				       (n << IMAP_NID_SHIFT));
+				tid &= (IMAP_AID_SAFARI |
+					IMAP_NID_SAFARI);
+			}
+		} else {
+			tid = cpuid << IMAP_TID_SHIFT;
+			tid &= IMAP_TID_UPA;
+		}
+	}
+
+	return tid;
+}
+
 /* Now these are always passed a true fully specified sun4u INO. */
 void enable_irq(unsigned int irq)
 {
 	struct ino_bucket *bucket = __bucket(irq);
-	unsigned long imap;
-	unsigned long tid;
+	unsigned long imap, cpuid;
 
 	imap = bucket->imap;
 	if (imap == 0UL)
@@ -149,47 +185,38 @@ void enable_irq(unsigned int irq)
 
 	preempt_disable();
 
-	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		unsigned long ver;
-
-		__asm__ ("rdpr %%ver, %0" : "=r" (ver));
-		if ((ver >> 32) == 0x003e0016) {
-			/* We set it to our JBUS ID. */
-			__asm__ __volatile__("ldxa [%%g0] %1, %0"
-					     : "=r" (tid)
-					     : "i" (ASI_JBUS_CONFIG));
-			tid = ((tid & (0x1fUL<<17)) << 9);
-			tid &= IMAP_TID_JBUS;
-		} else {
-			/* We set it to our Safari AID. */
-			__asm__ __volatile__("ldxa [%%g0] %1, %0"
-					     : "=r" (tid)
-					     : "i" (ASI_SAFARI_CONFIG));
-			tid = ((tid & (0x3ffUL<<17)) << 9);
-			tid &= IMAP_AID_SAFARI;
-		}
-	} else if (this_is_starfire == 0) {
-		/* We set it to our UPA MID. */
-		__asm__ __volatile__("ldxa [%%g0] %1, %0"
-				     : "=r" (tid)
-				     : "i" (ASI_UPA_CONFIG));
-		tid = ((tid & UPA_CONFIG_MID) << 9);
-		tid &= IMAP_TID_UPA;
+	/* This gets the physical processor ID, even on uniprocessor,
+	 * so we can always program the interrupt target correctly.
+	 */
+	cpuid = real_hard_smp_processor_id();
+
+	if (tlb_type == hypervisor) {
+		unsigned int ino = __irq_ino(irq);
+		int err;
+
+		err = sun4v_intr_settarget(ino, cpuid);
+		if (err != HV_EOK)
+			printk("sun4v_intr_settarget(%x,%lu): err(%d)\n",
+			       ino, cpuid, err);
+		err = sun4v_intr_setenabled(ino, HV_INTR_ENABLED);
+		if (err != HV_EOK)
+			printk("sun4v_intr_setenabled(%x): err(%d)\n",
+			       ino, err);
 	} else {
-		tid = (starfire_translate(imap, smp_processor_id()) << 26);
-		tid &= IMAP_TID_UPA;
+		unsigned int tid = sun4u_compute_tid(imap, cpuid);
+
+		/* NOTE NOTE NOTE, IGN and INO are read-only, IGN is a product
+		 * of this SYSIO's preconfigured IGN in the SYSIO Control
+		 * Register, the hardware just mirrors that value here.
+		 * However for Graphics and UPA Slave devices the full
+		 * IMAP_INR field can be set by the programmer here.
+		 *
+		 * Things like FFB can now be handled via the new IRQ
+		 * mechanism.
+		 */
+		upa_writel(tid | IMAP_VALID, imap);
 	}
 
-	/* NOTE NOTE NOTE, IGN and INO are read-only, IGN is a product
-	 * of this SYSIO's preconfigured IGN in the SYSIO Control
-	 * Register, the hardware just mirrors that value here.
-	 * However for Graphics and UPA Slave devices the full
-	 * IMAP_INR field can be set by the programmer here.
-	 *
-	 * Things like FFB can now be handled via the new IRQ mechanism.
-	 */
-	upa_writel(tid | IMAP_VALID, imap);
-
 	preempt_enable();
 }
 
@@ -201,16 +228,26 @@ void disable_irq(unsigned int irq)
 
 	imap = bucket->imap;
 	if (imap != 0UL) {
-		u32 tmp;
+		if (tlb_type == hypervisor) {
+			unsigned int ino = __irq_ino(irq);
+			int err;
+
+			err = sun4v_intr_setenabled(ino, HV_INTR_DISABLED);
+			if (err != HV_EOK)
+				printk("sun4v_intr_setenabled(%x): "
+				       "err(%d)\n", ino, err);
+		} else {
+			u32 tmp;
 
-		/* NOTE: We do not want to futz with the IRQ clear registers
-		 *       and move the state to IDLE, the SCSI code does call
-		 *       disable_irq() to assure atomicity in the queue cmd
-		 *       SCSI adapter driver code.  Thus we'd lose interrupts.
-		 */
-		tmp = upa_readl(imap);
-		tmp &= ~IMAP_VALID;
-		upa_writel(tmp, imap);
+			/* NOTE: We do not want to futz with the IRQ clear registers
+			 *       and move the state to IDLE, the SCSI code does call
+			 *       disable_irq() to assure atomicity in the queue cmd
+			 *       SCSI adapter driver code.  Thus we'd lose interrupts.
+			 */
+			tmp = upa_readl(imap);
+			tmp &= ~IMAP_VALID;
+			upa_writel(tmp, imap);
+		}
 	}
 }
 
@@ -248,6 +285,8 @@ unsigned int build_irq(int pil, int inofixup, unsigned long iclr, unsigned long
 		return __irq(&pil0_dummy_bucket);
 	}
 
+	BUG_ON(tlb_type == hypervisor);
+
 	/* RULE: Both must be specified in all other cases. */
 	if (iclr == 0UL || imap == 0UL) {
 		prom_printf("Invalid build_irq %d %d %016lx %016lx\n",
@@ -275,12 +314,11 @@ unsigned int build_irq(int pil, int inofixup, unsigned long iclr, unsigned long
 		goto out;
 	}
 
-	bucket->irq_info = kmalloc(sizeof(struct irq_desc), GFP_ATOMIC);
+	bucket->irq_info = kzalloc(sizeof(struct irq_desc), GFP_ATOMIC);
 	if (!bucket->irq_info) {
 		prom_printf("IRQ: Error, kmalloc(irq_desc) failed.\n");
 		prom_halt();
 	}
-	memset(bucket->irq_info, 0, sizeof(struct irq_desc));
 
 	/* Ok, looks good, set it up.  Don't touch the irq_chain or
 	 * the pending flag.
@@ -294,6 +332,37 @@ out:
 	return __irq(bucket);
 }
 
+unsigned int sun4v_build_irq(u32 devhandle, unsigned int devino, int pil, unsigned char flags)
+{
+	struct ino_bucket *bucket;
+	unsigned long sysino;
+
+	sysino = sun4v_devino_to_sysino(devhandle, devino);
+
+	bucket = &ivector_table[sysino];
+
+	/* Catch accidental accesses to these things.  IMAP/ICLR handling
+	 * is done by hypervisor calls on sun4v platforms, not by direct
+	 * register accesses.
+	 *
+	 * But we need to make them look unique for the disable_irq() logic
+	 * in free_irq().
+	 */
+	bucket->imap = ~0UL - sysino;
+	bucket->iclr = ~0UL - sysino;
+
+	bucket->pil = pil;
+	bucket->flags = flags;
+
+	bucket->irq_info = kzalloc(sizeof(struct irq_desc), GFP_ATOMIC);
+	if (!bucket->irq_info) {
+		prom_printf("IRQ: Error, kmalloc(irq_desc) failed.\n");
+		prom_halt();
+	}
+
+	return __irq(bucket);
+}
+
 static void atomic_bucket_insert(struct ino_bucket *bucket)
 {
 	unsigned long pstate;
@@ -482,7 +551,6 @@ void free_irq(unsigned int irq, void *dev_id)
 	bucket = __bucket(irq);
 	if (bucket != &pil0_dummy_bucket) {
 		struct irq_desc *desc = bucket->irq_info;
-		unsigned long imap = bucket->imap;
 		int ent, i;
 
 		for (i = 0; i < MAX_IRQ_DESC_ACTION; i++) {
@@ -495,6 +563,8 @@ void free_irq(unsigned int irq, void *dev_id)
 		}
 
 		if (!desc->action_active_mask) {
+			unsigned long imap = bucket->imap;
+
 			/* This unique interrupt source is now inactive. */
 			bucket->flags &= ~IBF_ACTIVE;
 
@@ -592,7 +662,18 @@ static void process_bucket(int irq, struct ino_bucket *bp, struct pt_regs *regs)
 			break;
 	}
 	if (bp->pil != 0) {
-		upa_writel(ICLR_IDLE, bp->iclr);
+		if (tlb_type == hypervisor) {
+			unsigned int ino = __irq_ino(bp);
+			int err;
+
+			err = sun4v_intr_setstate(ino, HV_INTR_STATE_IDLE);
+			if (err != HV_EOK)
+				printk("sun4v_intr_setstate(%x): "
+				       "err(%d)\n", ino, err);
+		} else {
+			upa_writel(ICLR_IDLE, bp->iclr);
+		}
+
 		/* Test and add entropy */
 		if (random & SA_SAMPLE_RANDOM)
 			add_interrupt_randomness(irq);
@@ -646,7 +727,7 @@ void handler_irq(int irq, struct pt_regs *regs)
 }
 
 #ifdef CONFIG_BLK_DEV_FD
-extern irqreturn_t floppy_interrupt(int, void *, struct pt_regs *);;
+extern irqreturn_t floppy_interrupt(int, void *, struct pt_regs *);
 
 /* XXX No easy way to include asm/floppy.h XXX */
 extern unsigned char *pdma_vaddr;
@@ -694,7 +775,7 @@ irqreturn_t sparc_floppy_irq(int irq, void *dev_cookie, struct pt_regs *regs)
 		val = readb(auxio_register);
 		val |= AUXIO_AUX1_FTCNT;
 		writeb(val, auxio_register);
-		val &= AUXIO_AUX1_FTCNT;
+		val &= ~AUXIO_AUX1_FTCNT;
 		writeb(val, auxio_register);
 
 		doing_pdma = 0;
@@ -727,25 +808,23 @@ EXPORT_SYMBOL(probe_irq_off);
 static int retarget_one_irq(struct irqaction *p, int goal_cpu)
 {
 	struct ino_bucket *bucket = get_ino_in_irqaction(p) + ivector_table;
-	unsigned long imap = bucket->imap;
-	unsigned int tid;
 
 	while (!cpu_online(goal_cpu)) {
 		if (++goal_cpu >= NR_CPUS)
 			goal_cpu = 0;
 	}
 
-	if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		tid = goal_cpu << 26;
-		tid &= IMAP_AID_SAFARI;
-	} else if (this_is_starfire == 0) {
-		tid = goal_cpu << 26;
-		tid &= IMAP_TID_UPA;
+	if (tlb_type == hypervisor) {
+		unsigned int ino = __irq_ino(bucket);
+
+		sun4v_intr_settarget(ino, goal_cpu);
+		sun4v_intr_setenabled(ino, HV_INTR_ENABLED);
 	} else {
-		tid = (starfire_translate(imap, goal_cpu) << 26);
-		tid &= IMAP_TID_UPA;
+		unsigned long imap = bucket->imap;
+		unsigned int tid = sun4u_compute_tid(imap, goal_cpu);
+
+		upa_writel(tid | IMAP_VALID, imap);
 	}
-	upa_writel(tid | IMAP_VALID, imap);
 
 	do {
 		if (++goal_cpu >= NR_CPUS)
@@ -848,33 +927,114 @@ static void kill_prom_timer(void)
 
 void init_irqwork_curcpu(void)
 {
-	register struct irq_work_struct *workp asm("o2");
-	register unsigned long tmp asm("o3");
 	int cpu = hard_smp_processor_id();
 
-	memset(__irq_work + cpu, 0, sizeof(*workp));
-
-	/* Make sure we are called with PSTATE_IE disabled.  */
-	__asm__ __volatile__("rdpr	%%pstate, %0\n\t"
-			     : "=r" (tmp));
-	if (tmp & PSTATE_IE) {
-		prom_printf("BUG: init_irqwork_curcpu() called with "
-			    "PSTATE_IE enabled, bailing.\n");
-		__asm__ __volatile__("mov	%%i7, %0\n\t"
-				     : "=r" (tmp));
-		prom_printf("BUG: Called from %lx\n", tmp);
+	memset(__irq_work + cpu, 0, sizeof(struct irq_work_struct));
+}
+
+static void __cpuinit register_one_mondo(unsigned long paddr, unsigned long type)
+{
+	unsigned long num_entries = 128;
+	unsigned long status;
+
+	status = sun4v_cpu_qconf(type, paddr, num_entries);
+	if (status != HV_EOK) {
+		prom_printf("SUN4V: sun4v_cpu_qconf(%lu:%lx:%lu) failed, "
+			    "err %lu\n", type, paddr, num_entries, status);
 		prom_halt();
 	}
+}
 
-	/* Set interrupt globals.  */
-	workp = &__irq_work[cpu];
-	__asm__ __volatile__(
-	"rdpr	%%pstate, %0\n\t"
-	"wrpr	%0, %1, %%pstate\n\t"
-	"mov	%2, %%g6\n\t"
-	"wrpr	%0, 0x0, %%pstate\n\t"
-	: "=&r" (tmp)
-	: "i" (PSTATE_IG), "r" (workp));
+static void __cpuinit sun4v_register_mondo_queues(int this_cpu)
+{
+	struct trap_per_cpu *tb = &trap_block[this_cpu];
+
+	register_one_mondo(tb->cpu_mondo_pa, HV_CPU_QUEUE_CPU_MONDO);
+	register_one_mondo(tb->dev_mondo_pa, HV_CPU_QUEUE_DEVICE_MONDO);
+	register_one_mondo(tb->resum_mondo_pa, HV_CPU_QUEUE_RES_ERROR);
+	register_one_mondo(tb->nonresum_mondo_pa, HV_CPU_QUEUE_NONRES_ERROR);
+}
+
+static void __cpuinit alloc_one_mondo(unsigned long *pa_ptr, int use_bootmem)
+{
+	void *page;
+
+	if (use_bootmem)
+		page = alloc_bootmem_low_pages(PAGE_SIZE);
+	else
+		page = (void *) get_zeroed_page(GFP_ATOMIC);
+
+	if (!page) {
+		prom_printf("SUN4V: Error, cannot allocate mondo queue.\n");
+		prom_halt();
+	}
+
+	*pa_ptr = __pa(page);
+}
+
+static void __cpuinit alloc_one_kbuf(unsigned long *pa_ptr, int use_bootmem)
+{
+	void *page;
+
+	if (use_bootmem)
+		page = alloc_bootmem_low_pages(PAGE_SIZE);
+	else
+		page = (void *) get_zeroed_page(GFP_ATOMIC);
+
+	if (!page) {
+		prom_printf("SUN4V: Error, cannot allocate kbuf page.\n");
+		prom_halt();
+	}
+
+	*pa_ptr = __pa(page);
+}
+
+static void __cpuinit init_cpu_send_mondo_info(struct trap_per_cpu *tb, int use_bootmem)
+{
+#ifdef CONFIG_SMP
+	void *page;
+
+	BUILD_BUG_ON((NR_CPUS * sizeof(u16)) > (PAGE_SIZE - 64));
+
+	if (use_bootmem)
+		page = alloc_bootmem_low_pages(PAGE_SIZE);
+	else
+		page = (void *) get_zeroed_page(GFP_ATOMIC);
+
+	if (!page) {
+		prom_printf("SUN4V: Error, cannot allocate cpu mondo page.\n");
+		prom_halt();
+	}
+
+	tb->cpu_mondo_block_pa = __pa(page);
+	tb->cpu_list_pa = __pa(page + 64);
+#endif
+}
+
+/* Allocate and register the mondo and error queues for this cpu.  */
+void __cpuinit sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load)
+{
+	struct trap_per_cpu *tb = &trap_block[cpu];
+
+	if (alloc) {
+		alloc_one_mondo(&tb->cpu_mondo_pa, use_bootmem);
+		alloc_one_mondo(&tb->dev_mondo_pa, use_bootmem);
+		alloc_one_mondo(&tb->resum_mondo_pa, use_bootmem);
+		alloc_one_kbuf(&tb->resum_kernel_buf_pa, use_bootmem);
+		alloc_one_mondo(&tb->nonresum_mondo_pa, use_bootmem);
+		alloc_one_kbuf(&tb->nonresum_kernel_buf_pa, use_bootmem);
+
+		init_cpu_send_mondo_info(tb, use_bootmem);
+	}
+
+	if (load) {
+		if (cpu != hard_smp_processor_id()) {
+			prom_printf("SUN4V: init mondo on cpu %d not %d\n",
+				    cpu, hard_smp_processor_id());
+			prom_halt();
+		}
+		sun4v_register_mondo_queues(cpu);
+	}
 }
 
 /* Only invoked on boot processor. */
@@ -884,6 +1044,9 @@ void __init init_IRQ(void)
 	kill_prom_timer();
 	memset(&ivector_table[0], 0, sizeof(ivector_table));
 
+	if (tlb_type == hypervisor)
+		sun4v_init_mondo_queues(1, hard_smp_processor_id(), 1, 1);
+
 	/* We need to clear any IRQ's pending in the soft interrupt
 	 * registers, a spurious one could be left around from the
 	 * PROM timer which we just disabled.
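The alloc/load split in sun4v_init_mondo_queues() above exists because the hypervisor only accepts a queue registration from the CPU that will own the queues, while the backing pages can be allocated by whichever CPU drives the bringup. The boot-CPU case is shown in init_IRQ(); the sketch below, with illustrative wrapper names not taken from this patch, shows how a secondary-CPU path could split the two phases:

extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load);
extern int hard_smp_processor_id(void);	/* may be a macro in the real headers */

/* Boot CPU: allocate the queue pages for a CPU that is about to come online.
 * use_bootmem is 0 because the page allocator is available by now. */
static void prepare_secondary_mondo(int cpu)
{
	sun4v_init_mondo_queues(0, cpu, 1, 0);
}

/* Running on the new CPU itself: register the already-allocated queues with
 * the hypervisor.  The load phase checks hard_smp_processor_id() and halts
 * if it is invoked from the wrong CPU. */
static void register_my_mondo_queues(void)
{
	sun4v_init_mondo_queues(0, hard_smp_processor_id(), 0, 1);
}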
diff --git a/arch/sparc64/kernel/itlb_base.S b/arch/sparc64/kernel/itlb_base.S
deleted file mode 100644
index 4951ff8f6877..000000000000
--- a/arch/sparc64/kernel/itlb_base.S
+++ /dev/null
@@ -1,79 +0,0 @@
-/* $Id: itlb_base.S,v 1.12 2002/02/09 19:49:30 davem Exp $
- * itlb_base.S:	Front end to ITLB miss replacement strategy.
- *              This is included directly into the trap table.
- *
- * Copyright (C) 1996,1998 David S. Miller (davem@redhat.com)
- * Copyright (C) 1997,1998 Jakub Jelinek   (jj@ultra.linux.cz)
- */
-
-#if PAGE_SHIFT == 13
-/*
- * To compute vpte offset, we need to do ((addr >> 13) << 3),
- * which can be optimized to (addr >> 10) if bits 10/11/12 can
- * be guaranteed to be 0 ... mmu_context.h does guarantee this
- * by only using 10 bits in the hwcontext value.
- */
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, 10, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) nop
-#else /* PAGE_SHIFT */
-#define CREATE_VPTE_OFFSET1(r1, r2) \
-				srax	r1, PAGE_SHIFT, r2
-#define CREATE_VPTE_OFFSET2(r1, r2) \
-				sllx	r2, 3, r2
-#endif /* PAGE_SHIFT */
-
-
-/* Ways we can get here:
- *
- * 1) Nucleus instruction misses from module code.
- * 2) All user instruction misses.
- *
- * All real page faults merge their code paths to the
- * sparc64_realfault_common label below.
- */
-
-/* ITLB ** ICACHE line 1: Quick user TLB misses		*/
-	mov		TLB_SFSR, %g1
-	ldxa		[%g1 + %g1] ASI_IMMU, %g4	! Get TAG_ACCESS
-	CREATE_VPTE_OFFSET1(%g4, %g6)			! Create VPTE offset
-	CREATE_VPTE_OFFSET2(%g4, %g6)			! Create VPTE offset
-	ldxa		[%g3 + %g6] ASI_P, %g5		! Load VPTE
-1:	brgez,pn	%g5, 3f				! Not valid, branch out
-	 sethi		%hi(_PAGE_EXEC), %g4		! Delay-slot
-	andcc		%g5, %g4, %g0			! Executable?
-
-/* ITLB ** ICACHE line 2: Real faults			*/
-	be,pn		%xcc, 3f			! Nope, branch.
-	 nop						! Delay-slot
-2:	stxa		%g5, [%g0] ASI_ITLB_DATA_IN	! Load PTE into TLB
-	retry						! Trap return
-3:	rdpr		%pstate, %g4			! Move into alt-globals
-	wrpr		%g4, PSTATE_AG|PSTATE_MG, %pstate
-	rdpr		%tpc, %g5			! And load faulting VA
-	mov		FAULT_CODE_ITLB, %g4		! It was read from ITLB
-
-/* ITLB ** ICACHE line 3: Finish faults	*/
-sparc64_realfault_common:				! Called by dtlb_miss
-	stb		%g4, [%g6 + TI_FAULT_CODE]
-	stx		%g5, [%g6 + TI_FAULT_ADDR]
-	ba,pt		%xcc, etrap			! Save state
-1:	 rd		%pc, %g7			! ...
-	call		do_sparc64_fault		! Call fault handler
-	 add		%sp, PTREGS_OFF, %o0! Compute pt_regs arg
-	ba,pt		%xcc, rtrap_clr_l6		! Restore cpu state
-	 nop
-
-/* ITLB ** ICACHE line 4: Window fixups */
-winfix_trampoline:
-	rdpr		%tpc, %g3			! Prepare winfixup TNPC
-	or		%g3, 0x7c, %g3			! Compute branch offset
-	wrpr		%g3, %tnpc			! Write it into TNPC
-	done						! Do it to it
-	nop
-	nop
-	nop
-	nop
-
-#undef CREATE_VPTE_OFFSET1
-#undef CREATE_VPTE_OFFSET2
diff --git a/arch/sparc64/kernel/itlb_miss.S b/arch/sparc64/kernel/itlb_miss.S
new file mode 100644
index 000000000000..ad46e2024f4b
--- /dev/null
+++ b/arch/sparc64/kernel/itlb_miss.S
@@ -0,0 +1,39 @@
+/* ITLB ** ICACHE line 1: Context 0 check and TSB load	*/
+	ldxa	[%g0] ASI_IMMU_TSB_8KB_PTR, %g1	! Get TSB 8K pointer
+	ldxa	[%g0] ASI_IMMU, %g6		! Get TAG TARGET
+	srlx	%g6, 48, %g5			! Get context
+	sllx	%g6, 22, %g6			! Zero out context
+	brz,pn	%g5, kvmap_itlb			! Context 0 processing
+	 srlx	%g6, 22, %g6			! Delay slot
+	TSB_LOAD_QUAD(%g1, %g4)			! Load TSB entry
+	cmp	%g4, %g6			! Compare TAG
+
+/* ITLB ** ICACHE line 2: TSB compare and TLB load	*/
+	bne,pn	%xcc, tsb_miss_itlb		! Miss
+	 mov	FAULT_CODE_ITLB, %g3
+	andcc	%g5, _PAGE_EXEC_4U, %g0		! Executable?
+	be,pn	%xcc, tsb_do_fault
+	 nop					! Delay slot, fill me
+	stxa	%g5, [%g0] ASI_ITLB_DATA_IN	! Load TLB
+	retry					! Trap done
+	nop
+
+/* ITLB ** ICACHE line 3: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+
+/* ITLB ** ICACHE line 4: 				*/
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
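The first four instructions of the new handler above unpack the MMU TAG TARGET register: the context number sits in the upper 16 bits and the virtual-address tag in the low 42 bits, and a context of zero marks a kernel address, which is diverted to kvmap_itlb. A C sketch of that field extraction (layout as implied by the shift amounts in the assembly):

#include <stdint.h>

struct tag_target_fields {
	unsigned int context;	/* TAG TARGET bits 63:48 */
	uint64_t     va_tag;	/* TAG TARGET bits 41:0, compared against the TSB tag */
};

static struct tag_target_fields decode_tag_target(uint64_t tag_target)
{
	struct tag_target_fields t;

	t.context = (unsigned int)(tag_target >> 48);	/* srlx %g6, 48, %g5 */
	t.va_tag  = (tag_target << 22) >> 22;		/* sllx 22 then srlx 22 clears the context */
	return t;
}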
diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c
index b9a9ce70e55c..ffc7309e9f22 100644
--- a/arch/sparc64/kernel/kprobes.c
+++ b/arch/sparc64/kernel/kprobes.c
@@ -6,9 +6,11 @@
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
+#include <linux/module.h>
 #include <asm/kdebug.h>
 #include <asm/signal.h>
 #include <asm/cacheflush.h>
+#include <asm/uaccess.h>
 
 /* We do not have hardware single-stepping on sparc64.
  * So we implement software single-stepping with breakpoint
@@ -302,16 +304,68 @@ static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 {
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	const struct exception_table_entry *entry;
+
+	switch(kcb->kprobe_status) {
+	case KPROBE_HIT_SS:
+	case KPROBE_REENTER:
+		/*
+		 * We are here because the instruction being single
+		 * stepped caused a page fault.  We reset the current
+		 * kprobe, point tpc back to the probe address, and
+		 * allow the page fault handler to continue as a
+		 * normal page fault.
+		 */
+		regs->tpc = (unsigned long)cur->addr;
+		regs->tnpc = kcb->kprobe_orig_tnpc;
+		regs->tstate = ((regs->tstate & ~TSTATE_PIL) |
+				kcb->kprobe_orig_tstate_pil);
+		if (kcb->kprobe_status == KPROBE_REENTER)
+			restore_previous_kprobe(kcb);
+		else
+			reset_current_kprobe();
+		preempt_enable_no_resched();
+		break;
+	case KPROBE_HIT_ACTIVE:
+	case KPROBE_HIT_SSDONE:
+		/*
+		 * We increment the nmissed count for accounting;
+		 * the npre/npostfault counts can also be used to
+		 * account for these specific fault cases.
+		 */
+		kprobes_inc_nmissed_count(cur);
+
+		/*
+		 * We come here because instructions in the pre/post
+		 * handler caused a page fault; this can happen if
+		 * the handler accesses user space via
+		 * copy_from_user(), get_user(), etc.  Let the
+		 * user-specified handler try to fix it first.
+		 */
+		if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
+			return 1;
 
-	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
-		return 1;
+		/*
+		 * In case the user-specified fault handler returned
+		 * zero, try to fix up.
+		 */
 
-	if (kcb->kprobe_status & KPROBE_HIT_SS) {
-		resume_execution(cur, regs, kcb);
+		entry = search_exception_tables(regs->tpc);
+		if (entry) {
+			regs->tpc = entry->fixup;
+			regs->tnpc = regs->tpc + 4;
+			return 1;
+		}
 
-		reset_current_kprobe();
-		preempt_enable_no_resched();
+		/*
+		 * There was no exception table entry for this fault;
+		 * let do_page_fault() fix it.
+		 */
+		break;
+	default:
+		break;
 	}
+
 	return 0;
 }
 
@@ -324,6 +378,9 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
 
+	if (args->regs && user_mode(args->regs))
+		return ret;
+
 	switch (val) {
 	case DIE_DEBUG:
 		if (kprobe_handler(args->regs))
diff --git a/arch/sparc64/kernel/ktlb.S b/arch/sparc64/kernel/ktlb.S
index d9244d3c9f73..31da1e564c95 100644
--- a/arch/sparc64/kernel/ktlb.S
+++ b/arch/sparc64/kernel/ktlb.S
@@ -4,191 +4,276 @@
  * Copyright (C) 1996 Eddie C. Dost        (ecd@brainaid.de)
  * Copyright (C) 1996 Miguel de Icaza      (miguel@nuclecu.unam.mx)
  * Copyright (C) 1996,98,99 Jakub Jelinek  (jj@sunsite.mff.cuni.cz)
-*/
+ */
 
 #include <linux/config.h>
 #include <asm/head.h>
 #include <asm/asi.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/tsb.h>
 
 	.text
 	.align		32
 
-/*
- * On a second level vpte miss, check whether the original fault is to the OBP 
- * range (note that this is only possible for instruction miss, data misses to
- * obp range do not use vpte). If so, go back directly to the faulting address.
- * This is because we want to read the tpc, otherwise we have no way of knowing
- * the 8k aligned faulting address if we are using >8k kernel pagesize. This
- * also ensures no vpte range addresses are dropped into tlb while obp is
- * executing (see inherit_locked_prom_mappings() rant).
- */
-sparc64_vpte_nucleus:
-	/* Note that kvmap below has verified that the address is
-	 * in the range MODULES_VADDR --> VMALLOC_END already.  So
-	 * here we need only check if it is an OBP address or not.
+kvmap_itlb:
+	/* g6: TAG TARGET */
+	mov		TLB_TAG_ACCESS, %g4
+	ldxa		[%g4] ASI_IMMU, %g4
+
+	/* sun4v_itlb_miss branches here with the missing virtual
+	 * address already loaded into %g4
 	 */
+kvmap_itlb_4v:
+
+kvmap_itlb_nonlinear:
+	/* Catch kernel NULL pointer calls.  */
+	sethi		%hi(PAGE_SIZE), %g5
+	cmp		%g4, %g5
+	bleu,pn		%xcc, kvmap_dtlb_longpath
+	 nop
+
+	KERN_TSB_LOOKUP_TL1(%g4, %g6, %g5, %g1, %g2, %g3, kvmap_itlb_load)
+
+kvmap_itlb_tsb_miss:
 	sethi		%hi(LOW_OBP_ADDRESS), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kern_vpte
+	blu,pn		%xcc, kvmap_itlb_vmalloc_addr
 	 mov		0x1, %g5
 	sllx		%g5, 32, %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, vpte_insn_obp
+	blu,pn		%xcc, kvmap_itlb_obp
 	 nop
 
-	/* These two instructions are patched by paginig_init().  */
-kern_vpte:
-	sethi		%hi(swapper_pgd_zero), %g5
-	lduw		[%g5 + %lo(swapper_pgd_zero)], %g5
+kvmap_itlb_vmalloc_addr:
+	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_itlb_longpath)
 
-	/* With kernel PGD in %g5, branch back into dtlb_backend.  */
-	ba,pt		%xcc, sparc64_kpte_continue
-	 andn		%g1, 0x3, %g1	/* Finish PMD offset adjustment.  */
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
 
-vpte_noent:
-	/* Restore previous TAG_ACCESS, %g5 is zero, and we will
-	 * skip over the trap instruction so that the top level
-	 * TLB miss handler will thing this %g5 value is just an
-	 * invalid PTE, thus branching to full fault processing.
-	 */
-	mov		TLB_SFSR, %g1
-	stxa		%g4, [%g1 + %g1] ASI_DMMU
-	done
-
-vpte_insn_obp:
-	/* Behave as if we are at TL0.  */
-	wrpr		%g0, 1, %tl
-	rdpr		%tpc, %g4	/* Find original faulting iaddr */
-	srlx		%g4, 13, %g4	/* Throw out context bits */
-	sllx		%g4, 13, %g4	/* g4 has vpn + ctx0 now */
-
-	/* Restore previous TAG_ACCESS.  */
-	mov		TLB_SFSR, %g1
-	stxa		%g4, [%g1 + %g1] ASI_IMMU
-
-	sethi		%hi(prom_trans), %g5
-	or		%g5, %lo(prom_trans), %g5
-
-1:	ldx		[%g5 + 0x00], %g6	! base
-	brz,a,pn	%g6, longpath		! no more entries, fail
-	 mov		TLB_SFSR, %g1		! and restore %g1
-	ldx		[%g5 + 0x08], %g1	! len
-	add		%g6, %g1, %g1		! end
-	cmp		%g6, %g4
-	bgu,pt		%xcc, 2f
-	 cmp		%g4, %g1
-	bgeu,pt		%xcc, 2f
-	 ldx		[%g5 + 0x10], %g1	! PTE
-
-	/* TLB load, restore %g1, and return from trap.  */
-	sub		%g4, %g6, %g6
-	add		%g1, %g6, %g5
-	mov		TLB_SFSR, %g1
-	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
-	retry
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	mov		1, %g7
+	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
+	brgez,a,pn	%g5, kvmap_itlb_longpath
+	 KTSB_STORE(%g1, %g7)
+
+	KTSB_WRITE(%g1, %g5, %g6)
+
+	/* fallthrough to TLB load */
 
-2:	ba,pt		%xcc, 1b
-	 add		%g5, (3 * 8), %g5	! next entry
-
-kvmap_do_obp:
-	sethi		%hi(prom_trans), %g5
-	or		%g5, %lo(prom_trans), %g5
-	srlx		%g4, 13, %g4
-	sllx		%g4, 13, %g4
-
-1:	ldx		[%g5 + 0x00], %g6	! base
-	brz,a,pn	%g6, longpath		! no more entries, fail
-	 mov		TLB_SFSR, %g1		! and restore %g1
-	ldx		[%g5 + 0x08], %g1	! len
-	add		%g6, %g1, %g1		! end
-	cmp		%g6, %g4
-	bgu,pt		%xcc, 2f
-	 cmp		%g4, %g1
-	bgeu,pt		%xcc, 2f
-	 ldx		[%g5 + 0x10], %g1	! PTE
-
-	/* TLB load, restore %g1, and return from trap.  */
-	sub		%g4, %g6, %g6
-	add		%g1, %g6, %g5
-	mov		TLB_SFSR, %g1
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN
+kvmap_itlb_load:
+
+661:	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
 	retry
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_ITLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_itlb_load
+	 mov		%g5, %g3
 
-2:	ba,pt		%xcc, 1b
-	 add		%g5, (3 * 8), %g5	! next entry
+kvmap_itlb_longpath:
+
+661:	rdpr	%pstate, %g5
+	wrpr	%g5, PSTATE_AG | PSTATE_MG, %pstate
+	.section .sun4v_2insn_patch, "ax"
+	.word	661b
+	SET_GL(1)
+	nop
+	.previous
+
+	rdpr	%tpc, %g5
+	ba,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_ITLB, %g4
+
+kvmap_itlb_obp:
+	OBP_TRANS_LOOKUP(%g4, %g5, %g2, %g3, kvmap_itlb_longpath)
+
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
+
+	KTSB_WRITE(%g1, %g5, %g6)
+
+	ba,pt		%xcc, kvmap_itlb_load
+	 nop
+
+kvmap_dtlb_obp:
+	OBP_TRANS_LOOKUP(%g4, %g5, %g2, %g3, kvmap_dtlb_longpath)
+
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
+
+	KTSB_WRITE(%g1, %g5, %g6)
+
+	ba,pt		%xcc, kvmap_dtlb_load
+	 nop
 
-/*
- * On a first level data miss, check whether this is to the OBP range (note
- * that such accesses can be made by prom, as well as by kernel using
- * prom_getproperty on "address"), and if so, do not use vpte access ...
- * rather, use information saved during inherit_prom_mappings() using 8k
- * pagesize.
- */
 	.align		32
-kvmap:
-	brgez,pn	%g4, kvmap_nonlinear
+kvmap_dtlb_tsb4m_load:
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
+	KTSB_WRITE(%g1, %g5, %g6)
+	ba,pt		%xcc, kvmap_dtlb_load
 	 nop
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
+kvmap_dtlb:
+	/* %g6: TAG TARGET */
+	mov		TLB_TAG_ACCESS, %g4
+	ldxa		[%g4] ASI_DMMU, %g4
+
+	/* sun4v_dtlb_miss branches here with the missing virtual
+	 * address already loaded into %g4
+	 */
+kvmap_dtlb_4v:
+	brgez,pn	%g4, kvmap_dtlb_nonlinear
+	 nop
+
+	/* Correct TAG_TARGET is already in %g6, check 4mb TSB.  */
+	KERN_TSB4M_LOOKUP_TL1(%g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load)
+
+	/* TSB entry address left in %g1, lookup linear PTE.
+	 * Must preserve %g1 and %g6 (TAG).
+	 */
+kvmap_dtlb_tsb4m_miss:
+	sethi		%hi(kpte_linear_bitmap), %g2
+	or		%g2, %lo(kpte_linear_bitmap), %g2
+
+	/* Clear the PAGE_OFFSET top virtual bits, then shift
+	 * down to get a 256MB physical address index.
+	 */
+	sllx		%g4, 21, %g5
+	mov		1, %g7
+	srlx		%g5, 21 + 28, %g5
+
+	/* Don't try this at home kids... this depends upon srlx
+	 * only taking the low 6 bits of the shift count in %g5.
+	 */
+	sllx		%g7, %g5, %g7
+
+	/* Divide by 64 to get the offset into the bitmask.  */
+	srlx		%g5, 6, %g5
+	sllx		%g5, 3, %g5
+
+	/* kern_linear_pte_xor[((mask & bit) ? 1 : 0)] */
+	ldx		[%g2 + %g5], %g2
+	andcc		%g2, %g7, %g0
+	sethi		%hi(kern_linear_pte_xor), %g5
+	or		%g5, %lo(kern_linear_pte_xor), %g5
+	bne,a,pt	%xcc, 1f
+	 add		%g5, 8, %g5
+
+1:	ldx		[%g5], %g2
+
 	.globl		kvmap_linear_patch
 kvmap_linear_patch:
-#endif
-	ba,pt		%xcc, kvmap_load
+	ba,pt		%xcc, kvmap_dtlb_tsb4m_load
 	 xor		%g2, %g4, %g5
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	sethi		%hi(swapper_pg_dir), %g5
-	or		%g5, %lo(swapper_pg_dir), %g5
-	sllx		%g4, 64 - (PGDIR_SHIFT + PGDIR_BITS), %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	andn		%g6, 0x3, %g6
-	lduw		[%g5 + %g6], %g5
-	brz,pn		%g5, longpath
-	 sllx		%g4, 64 - (PMD_SHIFT + PMD_BITS), %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	sllx		%g5, 11, %g5
-	andn		%g6, 0x3, %g6
-	lduwa		[%g5 + %g6] ASI_PHYS_USE_EC, %g5
-	brz,pn		%g5, longpath
-	 sllx		%g4, 64 - PMD_SHIFT, %g6
-	srlx		%g6, 64 - PAGE_SHIFT, %g6
-	sllx		%g5, 11, %g5
-	andn		%g6, 0x7, %g6
-	ldxa		[%g5 + %g6] ASI_PHYS_USE_EC, %g5
-	brz,pn		%g5, longpath
+kvmap_dtlb_vmalloc_addr:
+	KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_dtlb_longpath)
+
+	KTSB_LOCK_TAG(%g1, %g2, %g7)
+
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	mov		1, %g7
+	sllx		%g7, TSB_TAG_INVALID_BIT, %g7
+	brgez,a,pn	%g5, kvmap_dtlb_longpath
+	 KTSB_STORE(%g1, %g7)
+
+	KTSB_WRITE(%g1, %g5, %g6)
+
+	/* fallthrough to TLB load */
+
+kvmap_dtlb_load:
+
+661:	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
+	retry
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_DTLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_dtlb_load
+	 mov		%g5, %g3
+
+kvmap_dtlb_nonlinear:
+	/* Catch kernel NULL pointer derefs.  */
+	sethi		%hi(PAGE_SIZE), %g5
+	cmp		%g4, %g5
+	bleu,pn		%xcc, kvmap_dtlb_longpath
 	 nop
-	ba,a,pt		%xcc, kvmap_load
-#endif
 
-kvmap_nonlinear:
+	KERN_TSB_LOOKUP_TL1(%g4, %g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load)
+
+kvmap_dtlb_tsbmiss:
 	sethi		%hi(MODULES_VADDR), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, longpath
+	blu,pn		%xcc, kvmap_dtlb_longpath
 	 mov		(VMALLOC_END >> 24), %g5
 	sllx		%g5, 24, %g5
 	cmp		%g4, %g5
-	bgeu,pn		%xcc, longpath
+	bgeu,pn		%xcc, kvmap_dtlb_longpath
 	 nop
 
 kvmap_check_obp:
 	sethi		%hi(LOW_OBP_ADDRESS), %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kvmap_vmalloc_addr
+	blu,pn		%xcc, kvmap_dtlb_vmalloc_addr
 	 mov		0x1, %g5
 	sllx		%g5, 32, %g5
 	cmp		%g4, %g5
-	blu,pn		%xcc, kvmap_do_obp
+	blu,pn		%xcc, kvmap_dtlb_obp
 	 nop
-
-kvmap_vmalloc_addr:
-	/* If we get here, a vmalloc addr was accessed, load kernel VPTE.  */
-	ldxa		[%g3 + %g6] ASI_N, %g5
-	brgez,pn	%g5, longpath
+	ba,pt		%xcc, kvmap_dtlb_vmalloc_addr
 	 nop
 
-kvmap_load:
-	/* PTE is valid, load into TLB and return from trap.  */
-	stxa		%g5, [%g0] ASI_DTLB_DATA_IN	! Reload TLB
-	retry
+kvmap_dtlb_longpath:
+
+661:	rdpr	%pstate, %g5
+	wrpr	%g5, PSTATE_AG | PSTATE_MG, %pstate
+	.section .sun4v_2insn_patch, "ax"
+	.word	661b
+	SET_GL(1)
+	ldxa		[%g0] ASI_SCRATCHPAD, %g5
+	.previous
+
+	rdpr	%tl, %g3
+	cmp	%g3, 1
+
+661:	mov	TLB_TAG_ACCESS, %g4
+	ldxa	[%g4] ASI_DMMU, %g5
+	.section .sun4v_2insn_patch, "ax"
+	.word	661b
+	ldx	[%g5 + HV_FAULT_D_ADDR_OFFSET], %g5
+	nop
+	.previous
+
+	be,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_DTLB, %g4
+	ba,pt	%xcc, winfix_trampoline
+	 nop
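The kvmap_dtlb_tsb4m_miss path above is easier to follow in C: the kernel linear mapping is handled in 256MB slices, kpte_linear_bitmap has one bit per slice, and that bit selects which of the two kern_linear_pte_xor templates is XORed with the virtual address to form the PTE (what the two templates differ in is defined elsewhere in this patch, typically page-size or cacheability attributes). A sketch of the index computation the assembly performs:

extern unsigned long kpte_linear_bitmap[];	/* one bit per 256MB slice of the linear map */
extern unsigned long kern_linear_pte_xor[2];	/* PTE template selected per slice */

static unsigned long linear_pte_for(unsigned long vaddr)
{
	/* Drop the 21 PAGE_OFFSET prefix bits, then index by 256MB (2^28) slice. */
	unsigned long slice = (vaddr << 21) >> (21 + 28);
	unsigned long word  = slice >> 6;		/* 64 slices per bitmap word */
	unsigned long bit   = 1UL << (slice & 63);
	int sel = (kpte_linear_bitmap[word] & bit) ? 1 : 0;

	return vaddr ^ kern_linear_pte_xor[sel];
}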
diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c
index 2ff7c32ab0ce..dfccff29e182 100644
--- a/arch/sparc64/kernel/pci.c
+++ b/arch/sparc64/kernel/pci.c
@@ -188,6 +188,7 @@ extern void psycho_init(int, char *);
 extern void schizo_init(int, char *);
 extern void schizo_plus_init(int, char *);
 extern void tomatillo_init(int, char *);
+extern void sun4v_pci_init(int, char *);
 
 static struct {
 	char *model_name;
@@ -204,6 +205,7 @@ static struct {
 	{ "pci108e,8002", schizo_plus_init },
 	{ "SUNW,tomatillo", tomatillo_init },
 	{ "pci108e,a801", tomatillo_init },
+	{ "SUNW,sun4v-pci", sun4v_pci_init },
 };
 #define PCI_NUM_CONTROLLER_TYPES (sizeof(pci_controller_table) / \
 				  sizeof(pci_controller_table[0]))
@@ -283,6 +285,12 @@ int __init pcic_present(void)
 	return pci_controller_scan(pci_is_controller);
 }
 
+struct pci_iommu_ops *pci_iommu_ops;
+EXPORT_SYMBOL(pci_iommu_ops);
+
+extern struct pci_iommu_ops pci_sun4u_iommu_ops,
+	pci_sun4v_iommu_ops;
+
 /* Find each controller in the system, attach and initialize
  * software state structure for each and link into the
  * pci_controller_root.  Setup the controller enough such
@@ -290,6 +298,11 @@ int __init pcic_present(void)
  */
 static void __init pci_controller_probe(void)
 {
+	if (tlb_type == hypervisor)
+		pci_iommu_ops = &pci_sun4v_iommu_ops;
+	else
+		pci_iommu_ops = &pci_sun4u_iommu_ops;
+
 	printk("PCI: Probing for controllers.\n");
 
 	pci_controller_scan(pci_controller_init);
@@ -643,6 +656,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	__pci_mmap_set_flags(dev, vma, mmap_state);
 	__pci_mmap_set_pgprot(dev, vma, mmap_state);
 
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	ret = io_remap_pfn_range(vma, vma->vm_start,
 				 vma->vm_pgoff,
 				 vma->vm_end - vma->vm_start,
@@ -650,7 +664,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	if (ret)
 		return ret;
 
-	vma->vm_flags |= VM_IO;
 	return 0;
 }
 
diff --git a/arch/sparc64/kernel/pci_common.c b/arch/sparc64/kernel/pci_common.c
index 58310aacea28..33dedb1aacd4 100644
--- a/arch/sparc64/kernel/pci_common.c
+++ b/arch/sparc64/kernel/pci_common.c
@@ -39,6 +39,8 @@ static int __init find_device_prom_node(struct pci_pbm_info *pbm,
 {
 	int node;
 
+	*nregs = 0;
+
 	/*
 	 * Return the PBM's PROM node in case we are it's PCI device,
 	 * as the PBM's reg property is different to standard PCI reg
@@ -51,10 +53,8 @@ static int __init find_device_prom_node(struct pci_pbm_info *pbm,
 	     pdev->device == PCI_DEVICE_ID_SUN_SCHIZO ||
 	     pdev->device == PCI_DEVICE_ID_SUN_TOMATILLO ||
 	     pdev->device == PCI_DEVICE_ID_SUN_SABRE ||
-	     pdev->device == PCI_DEVICE_ID_SUN_HUMMINGBIRD)) {
-		*nregs = 0;
+	     pdev->device == PCI_DEVICE_ID_SUN_HUMMINGBIRD))
 		return bus_prom_node;
-	}
 
 	node = prom_getchild(bus_prom_node);
 	while (node != 0) {
@@ -541,135 +541,183 @@ void __init pci_assign_unassigned(struct pci_pbm_info *pbm,
 		pci_assign_unassigned(pbm, bus);
 }
 
-static int __init pci_intmap_match(struct pci_dev *pdev, unsigned int *interrupt)
+static inline unsigned int pci_slot_swivel(struct pci_pbm_info *pbm,
+					   struct pci_dev *toplevel_pdev,
+					   struct pci_dev *pdev,
+					   unsigned int interrupt)
 {
-	struct linux_prom_pci_intmap bridge_local_intmap[PROM_PCIIMAP_MAX], *intmap;
-	struct linux_prom_pci_intmask bridge_local_intmask, *intmask;
-	struct pcidev_cookie *dev_pcp = pdev->sysdata;
-	struct pci_pbm_info *pbm = dev_pcp->pbm;
-	struct linux_prom_pci_registers *pregs = dev_pcp->prom_regs;
-	unsigned int hi, mid, lo, irq;
-	int i, num_intmap, map_slot;
+	unsigned int ret;
 
-	intmap = &pbm->pbm_intmap[0];
-	intmask = &pbm->pbm_intmask;
-	num_intmap = pbm->num_pbm_intmap;
-	map_slot = 0;
+	if (unlikely(interrupt < 1 || interrupt > 4)) {
+		printk("%s: Device %s interrupt value of %u is strange.\n",
+		       pbm->name, pci_name(pdev), interrupt);
+		return interrupt;
+	}
 
-	/* If we are underneath a PCI bridge, use PROM register
-	 * property of the parent bridge which is closest to
-	 * the PBM.
-	 *
-	 * However if that parent bridge has interrupt map/mask
-	 * properties of its own we use the PROM register property
-	 * of the next child device on the path to PDEV.
-	 *
-	 * In detail the two cases are (note that the 'X' below is the
-	 * 'next child on the path to PDEV' mentioned above):
-	 *
-	 * 1) PBM --> PCI bus lacking int{map,mask} --> X ... PDEV
-	 *
-	 *    Here we use regs of 'PCI bus' device.
-	 *
-	 * 2) PBM --> PCI bus with int{map,mask} --> X ... PDEV
-	 *
-	 *    Here we use regs of 'X'.  Note that X can be PDEV.
-	 */
-	if (pdev->bus->number != pbm->pci_first_busno) {
-		struct pcidev_cookie *bus_pcp, *regs_pcp;
-		struct pci_dev *bus_dev, *regs_dev;
-		int plen;
+	ret = ((interrupt - 1 + (PCI_SLOT(pdev->devfn) & 3)) & 3) + 1;
+
+	printk("%s: %s IRQ Swivel %s [%x:%x] -> [%x]\n",
+	       pbm->name, pci_name(toplevel_pdev), pci_name(pdev),
+	       interrupt, PCI_SLOT(pdev->devfn), ret);
+
+	return ret;
+}
+
+static inline unsigned int pci_apply_intmap(struct pci_pbm_info *pbm,
+					    struct pci_dev *toplevel_pdev,
+					    struct pci_dev *pbus,
+					    struct pci_dev *pdev,
+					    unsigned int interrupt,
+					    unsigned int *cnode)
+{
+	struct linux_prom_pci_intmap imap[PROM_PCIIMAP_MAX];
+	struct linux_prom_pci_intmask imask;
+	struct pcidev_cookie *pbus_pcp = pbus->sysdata;
+	struct pcidev_cookie *pdev_pcp = pdev->sysdata;
+	struct linux_prom_pci_registers *pregs = pdev_pcp->prom_regs;
+	int plen, num_imap, i;
+	unsigned int hi, mid, lo, irq, orig_interrupt;
+
+	*cnode = pbus_pcp->prom_node;
+
+	plen = prom_getproperty(pbus_pcp->prom_node, "interrupt-map",
+				(char *) &imap[0], sizeof(imap));
+	if (plen <= 0 ||
+	    (plen % sizeof(struct linux_prom_pci_intmap)) != 0) {
+		printk("%s: Device %s interrupt-map has bad len %d\n",
+		       pbm->name, pci_name(pbus), plen);
+		goto no_intmap;
+	}
+	num_imap = plen / sizeof(struct linux_prom_pci_intmap);
+
+	plen = prom_getproperty(pbus_pcp->prom_node, "interrupt-map-mask",
+				(char *) &imask, sizeof(imask));
+	if (plen <= 0 ||
+	    (plen % sizeof(struct linux_prom_pci_intmask)) != 0) {
+		printk("%s: Device %s interrupt-map-mask has bad len %d\n",
+		       pbm->name, pci_name(pbus), plen);
+		goto no_intmap;
+	}
+
+	orig_interrupt = interrupt;
 
-		bus_dev = pdev->bus->self;
-		regs_dev = pdev;
+	hi   = pregs->phys_hi & imask.phys_hi;
+	mid  = pregs->phys_mid & imask.phys_mid;
+	lo   = pregs->phys_lo & imask.phys_lo;
+	irq  = interrupt & imask.interrupt;
 
-		while (bus_dev->bus &&
-		       bus_dev->bus->number != pbm->pci_first_busno) {
-			regs_dev = bus_dev;
-			bus_dev = bus_dev->bus->self;
+	for (i = 0; i < num_imap; i++) {
+		if (imap[i].phys_hi  == hi   &&
+		    imap[i].phys_mid == mid  &&
+		    imap[i].phys_lo  == lo   &&
+		    imap[i].interrupt == irq) {
+			*cnode = imap[i].cnode;
+			interrupt = imap[i].cinterrupt;
 		}
+	}
 
-		regs_pcp = regs_dev->sysdata;
-		pregs = regs_pcp->prom_regs;
+	printk("%s: %s MAP BUS %s DEV %s [%x] -> [%x]\n",
+	       pbm->name, pci_name(toplevel_pdev),
+	       pci_name(pbus), pci_name(pdev),
+	       orig_interrupt, interrupt);
 
-		bus_pcp = bus_dev->sysdata;
+no_intmap:
+	return interrupt;
+}
 
-		/* But if the PCI bridge has it's own interrupt map
-		 * and mask properties, use that and the regs of the
-		 * PCI entity at the next level down on the path to the
-		 * device.
-		 */
-		plen = prom_getproperty(bus_pcp->prom_node, "interrupt-map",
-					(char *) &bridge_local_intmap[0],
-					sizeof(bridge_local_intmap));
-		if (plen != -1) {
-			intmap = &bridge_local_intmap[0];
-			num_intmap = plen / sizeof(struct linux_prom_pci_intmap);
-			plen = prom_getproperty(bus_pcp->prom_node,
-						"interrupt-map-mask",
-						(char *) &bridge_local_intmask,
-						sizeof(bridge_local_intmask));
-			if (plen == -1) {
-				printk("pci_intmap_match: Warning! Bridge has intmap "
-				       "but no intmask.\n");
-				printk("pci_intmap_match: Trying to recover.\n");
-				return 0;
-			}
+/* For each PCI bus on the way to the root:
+ * 1) If it has an interrupt-map property, apply it.
+ * 2) Else, swivel the interrupt number based upon the PCI device number.
+ *
+ * Return the "IRQ controller" node.  If this is the PBM's device node,
+ * all interrupt translations are complete, else we should use that node's
+ * "reg" property to apply the PBM's "interrupt-{map,mask}" to the interrupt.
+ */
+static unsigned int __init pci_intmap_match_to_root(struct pci_pbm_info *pbm,
+						    struct pci_dev *pdev,
+						    unsigned int *interrupt)
+{
+	struct pci_dev *toplevel_pdev = pdev;
+	struct pcidev_cookie *toplevel_pcp = toplevel_pdev->sysdata;
+	unsigned int cnode = toplevel_pcp->prom_node;
+
+	while (pdev->bus->number != pbm->pci_first_busno) {
+		struct pci_dev *pbus = pdev->bus->self;
+		struct pcidev_cookie *pcp = pbus->sysdata;
+		int plen;
 
-			if (pdev->bus->self != bus_dev)
-				map_slot = 1;
+		plen = prom_getproplen(pcp->prom_node, "interrupt-map");
+		if (plen <= 0) {
+			*interrupt = pci_slot_swivel(pbm, toplevel_pdev,
+						     pdev, *interrupt);
+			cnode = pcp->prom_node;
 		} else {
-			pregs = bus_pcp->prom_regs;
-			map_slot = 1;
+			*interrupt = pci_apply_intmap(pbm, toplevel_pdev,
+						      pbus, pdev,
+						      *interrupt, &cnode);
+
+			while (pcp->prom_node != cnode &&
+			       pbus->bus->number != pbm->pci_first_busno) {
+				pbus = pbus->bus->self;
+				pcp = pbus->sysdata;
+			}
 		}
-	}
+		pdev = pbus;
 
-	if (map_slot) {
-		*interrupt = ((*interrupt
-			       - 1
-			       + PCI_SLOT(pdev->devfn)) & 0x3) + 1;
+		if (cnode == pbm->prom_node)
+			break;
 	}
 
-	hi   = pregs->phys_hi & intmask->phys_hi;
-	mid  = pregs->phys_mid & intmask->phys_mid;
-	lo   = pregs->phys_lo & intmask->phys_lo;
-	irq  = *interrupt & intmask->interrupt;
-
-	for (i = 0; i < num_intmap; i++) {
-		if (intmap[i].phys_hi  == hi	&&
-		    intmap[i].phys_mid == mid	&&
-		    intmap[i].phys_lo  == lo	&&
-		    intmap[i].interrupt == irq) {
-			*interrupt = intmap[i].cinterrupt;
-			printk("PCI-IRQ: Routing bus[%2x] slot[%2x] map[%d] to INO[%02x]\n",
-			       pdev->bus->number, PCI_SLOT(pdev->devfn),
-			       map_slot, *interrupt);
-			return 1;
-		}
+	return cnode;
+}
+
+static int __init pci_intmap_match(struct pci_dev *pdev, unsigned int *interrupt)
+{
+	struct pcidev_cookie *dev_pcp = pdev->sysdata;
+	struct pci_pbm_info *pbm = dev_pcp->pbm;
+	struct linux_prom_pci_registers reg[PROMREG_MAX];
+	unsigned int hi, mid, lo, irq;
+	int i, cnode, plen;
+
+	cnode = pci_intmap_match_to_root(pbm, pdev, interrupt);
+	if (cnode == pbm->prom_node)
+		goto success;
+
+	plen = prom_getproperty(cnode, "reg", (char *) reg, sizeof(reg));
+	if (plen <= 0 ||
+	    (plen % sizeof(struct linux_prom_pci_registers)) != 0) {
+		printk("%s: OBP node %x reg property has bad len %d\n",
+		       pbm->name, cnode, plen);
+		goto fail;
 	}
 
-	/* We will run this code even if pbm->num_pbm_intmap is zero, just so
-	 * we can apply the slot mapping to the PROM interrupt property value.
-	 * So do not spit out these warnings in that case.
-	 */
-	if (num_intmap != 0) {
-		/* Print it both to OBP console and kernel one so that if bootup
-		 * hangs here the user has the information to report.
-		 */
-		prom_printf("pci_intmap_match: bus %02x, devfn %02x: ",
-			    pdev->bus->number, pdev->devfn);
-		prom_printf("IRQ [%08x.%08x.%08x.%08x] not found in interrupt-map\n",
-			    pregs->phys_hi, pregs->phys_mid, pregs->phys_lo, *interrupt);
-		prom_printf("Please email this information to davem@redhat.com\n");
-
-		printk("pci_intmap_match: bus %02x, devfn %02x: ",
-		       pdev->bus->number, pdev->devfn);
-		printk("IRQ [%08x.%08x.%08x.%08x] not found in interrupt-map\n",
-		       pregs->phys_hi, pregs->phys_mid, pregs->phys_lo, *interrupt);
-		printk("Please email this information to davem@redhat.com\n");
+	hi   = reg[0].phys_hi & pbm->pbm_intmask.phys_hi;
+	mid  = reg[0].phys_mid & pbm->pbm_intmask.phys_mid;
+	lo   = reg[0].phys_lo & pbm->pbm_intmask.phys_lo;
+	irq  = *interrupt & pbm->pbm_intmask.interrupt;
+
+	for (i = 0; i < pbm->num_pbm_intmap; i++) {
+		struct linux_prom_pci_intmap *intmap;
+
+		intmap = &pbm->pbm_intmap[i];
+
+		if (intmap->phys_hi  == hi  &&
+		    intmap->phys_mid == mid &&
+		    intmap->phys_lo  == lo  &&
+		    intmap->interrupt == irq) {
+			*interrupt = intmap->cinterrupt;
+			goto success;
+		}
 	}
 
+fail:
 	return 0;
+
+success:
+	printk("PCI-IRQ: Routing bus[%2x] slot[%2x] to INO[%02x]\n",
+	       pdev->bus->number, PCI_SLOT(pdev->devfn),
+	       *interrupt);
+	return 1;
 }
 
 static void __init pdev_fixup_irq(struct pci_dev *pdev)
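The rotation in pci_slot_swivel() earlier in this hunk is the standard PCI interrupt-pin swizzle: the pin is rotated by the device's slot number modulo four on the way up through each bridge that lacks an interrupt-map. A quick worked check of the formula (values purely illustrative):

#include <assert.h>

/* Same arithmetic as pci_slot_swivel(): ((interrupt - 1 + (slot & 3)) & 3) + 1 */
static unsigned int swivel(unsigned int interrupt, unsigned int slot)
{
	return ((interrupt - 1 + (slot & 3)) & 3) + 1;
}

int main(void)
{
	assert(swivel(1, 0) == 1);	/* INTA in slot 0 stays INTA */
	assert(swivel(1, 2) == 3);	/* INTA in slot 2 emerges as INTC */
	assert(swivel(4, 1) == 1);	/* INTD in slot 1 wraps back around to INTA */
	return 0;
}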
@@ -703,16 +751,18 @@ static void __init pdev_fixup_irq(struct pci_dev *pdev)
 		return;
 	}
 
-	/* Fully specified already? */
-	if (((prom_irq & PCI_IRQ_IGN) >> 6) == portid) {
-		pdev->irq = p->irq_build(pbm, pdev, prom_irq);
-		goto have_irq;
-	}
+	if (tlb_type != hypervisor) {
+		/* Fully specified already? */
+		if (((prom_irq & PCI_IRQ_IGN) >> 6) == portid) {
+			pdev->irq = p->irq_build(pbm, pdev, prom_irq);
+			goto have_irq;
+		}
 
-	/* An onboard device? (bit 5 set) */
-	if ((prom_irq & PCI_IRQ_INO) & 0x20) {
-		pdev->irq = p->irq_build(pbm, pdev, (portid << 6 | prom_irq));
-		goto have_irq;
+		/* An onboard device? (bit 5 set) */
+		if ((prom_irq & PCI_IRQ_INO) & 0x20) {
+			pdev->irq = p->irq_build(pbm, pdev, (portid << 6 | prom_irq));
+			goto have_irq;
+		}
 	}
 
 	/* Can we find a matching entry in the interrupt-map? */
@@ -927,33 +977,30 @@ void pci_register_legacy_regions(struct resource *io_res,
 	struct resource *p;
 
 	/* VGA Video RAM. */
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;
 
-	memset(p, 0, sizeof(*p));
 	p->name = "Video RAM area";
 	p->start = mem_res->start + 0xa0000UL;
 	p->end = p->start + 0x1ffffUL;
 	p->flags = IORESOURCE_BUSY;
 	request_resource(mem_res, p);
 
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;
 
-	memset(p, 0, sizeof(*p));
 	p->name = "System ROM";
 	p->start = mem_res->start + 0xf0000UL;
 	p->end = p->start + 0xffffUL;
 	p->flags = IORESOURCE_BUSY;
 	request_resource(mem_res, p);
 
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
 		return;
 
-	memset(p, 0, sizeof(*p));
 	p->name = "Video ROM";
 	p->start = mem_res->start + 0xc0000UL;
 	p->end = p->start + 0x7fffUL;
diff --git a/arch/sparc64/kernel/pci_iommu.c b/arch/sparc64/kernel/pci_iommu.c
index a11910be1013..8efbc139769d 100644
--- a/arch/sparc64/kernel/pci_iommu.c
+++ b/arch/sparc64/kernel/pci_iommu.c
@@ -139,12 +139,11 @@ void pci_iommu_table_init(struct pci_iommu *iommu, int tsbsize, u32 dma_offset,
 	/* Allocate and initialize the free area map.  */
 	sz = num_tsb_entries / 8;
 	sz = (sz + 7UL) & ~7UL;
-	iommu->arena.map = kmalloc(sz, GFP_KERNEL);
+	iommu->arena.map = kzalloc(sz, GFP_KERNEL);
 	if (!iommu->arena.map) {
 		prom_printf("PCI_IOMMU: Error, kmalloc(arena.map) failed.\n");
 		prom_halt();
 	}
-	memset(iommu->arena.map, 0, sz);
 	iommu->arena.limit = num_tsb_entries;
 
 	/* Allocate and initialize the dummy page which we
@@ -219,7 +218,7 @@ static inline void iommu_free_ctx(struct pci_iommu *iommu, int ctx)
  * DMA for PCI device PDEV.  Return non-NULL cpu-side address if
  * successful and set *DMA_ADDRP to the PCI side dma address.
  */
-void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp)
+static void *pci_4u_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -267,7 +266,7 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_ad
 }
 
 /* Free and unmap a consistent DMA translation. */
-void pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_t dvma)
+static void pci_4u_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_t dvma)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -294,7 +293,7 @@ void pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_
 /* Map a single buffer at PTR of SZ bytes for PCI DMA
  * in streaming mode.
  */
-dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direction)
+static dma_addr_t pci_4u_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -415,7 +414,7 @@ do_flush_sync:
 }
 
 /* Unmap a single streaming mode DMA translation. */
-void pci_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
+static void pci_4u_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -548,7 +547,7 @@ static inline void fill_sg(iopte_t *iopte, struct scatterlist *sg,
  * When making changes here, inspect the assembly output. I was having
 * a hard time keeping this routine from using stack slots for holding variables.
  */
-int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+static int pci_4u_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -562,9 +561,9 @@ int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int
 	/* Fast path single entry scatterlists. */
 	if (nelems == 1) {
 		sglist->dma_address =
-			pci_map_single(pdev,
-				       (page_address(sglist->page) + sglist->offset),
-				       sglist->length, direction);
+			pci_4u_map_single(pdev,
+					  (page_address(sglist->page) + sglist->offset),
+					  sglist->length, direction);
 		if (unlikely(sglist->dma_address == PCI_DMA_ERROR_CODE))
 			return 0;
 		sglist->dma_length = sglist->length;
@@ -635,7 +634,7 @@ bad_no_ctx:
 }
 
 /* Unmap a set of streaming mode DMA translations. */
-void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+static void pci_4u_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -695,7 +694,7 @@ void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems,
 /* Make physical memory consistent for a single
  * streaming mode DMA translation after a transfer.
  */
-void pci_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
+static void pci_4u_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -735,7 +734,7 @@ void pci_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size
 /* Make physical memory consistent for a set of streaming
  * mode DMA translations after a transfer.
  */
-void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+static void pci_4u_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
 {
 	struct pcidev_cookie *pcp;
 	struct pci_iommu *iommu;
@@ -776,6 +775,17 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, i
 	spin_unlock_irqrestore(&iommu->lock, flags);
 }
 
+struct pci_iommu_ops pci_sun4u_iommu_ops = {
+	.alloc_consistent		= pci_4u_alloc_consistent,
+	.free_consistent		= pci_4u_free_consistent,
+	.map_single			= pci_4u_map_single,
+	.unmap_single			= pci_4u_unmap_single,
+	.map_sg				= pci_4u_map_sg,
+	.unmap_sg			= pci_4u_unmap_sg,
+	.dma_sync_single_for_cpu	= pci_4u_dma_sync_single_for_cpu,
+	.dma_sync_sg_for_cpu		= pci_4u_dma_sync_sg_for_cpu,
+};
+
 static void ali_sound_dma_hack(struct pci_dev *pdev, int set_bit)
 {
 	struct pci_dev *ali_isa_bridge;
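With the streaming and consistent DMA entry points renamed to pci_4u_* and collected into pci_sun4u_iommu_ops (its sun4v counterpart is added by pci_sun4v.c in this same patch), the old global pci_map_single() and friends can become thin dispatchers through the pci_iommu_ops pointer selected in pci_controller_probe(). The actual wrappers live in a header that is not part of the hunks shown here, so the following is only a sketch of the shape one of them takes:

/* Sketch only: dispatching one DMA op through the boot-time selected table.
 * struct pci_iommu_ops is declared in the sparc64 headers touched by this patch. */
#include <linux/pci.h>

extern struct pci_iommu_ops *pci_iommu_ops;

dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direction)
{
	return pci_iommu_ops->map_single(pdev, ptr, sz, direction);
}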
diff --git a/arch/sparc64/kernel/pci_psycho.c b/arch/sparc64/kernel/pci_psycho.c
index c03ed5f49d31..d17878b145c2 100644
--- a/arch/sparc64/kernel/pci_psycho.c
+++ b/arch/sparc64/kernel/pci_psycho.c
@@ -286,17 +286,17 @@ static unsigned char psycho_pil_table[] = {
 /*0x14*/0, 0, 0, 0,	/* PCI B slot 1  Int A, B, C, D */
 /*0x18*/0, 0, 0, 0,	/* PCI B slot 2  Int A, B, C, D */
 /*0x1c*/0, 0, 0, 0,	/* PCI B slot 3  Int A, B, C, D */
-/*0x20*/4,		/* SCSI				*/
+/*0x20*/5,		/* SCSI				*/
 /*0x21*/5,		/* Ethernet			*/
 /*0x22*/8,		/* Parallel Port		*/
 /*0x23*/13,		/* Audio Record			*/
 /*0x24*/14,		/* Audio Playback		*/
 /*0x25*/15,		/* PowerFail			*/
-/*0x26*/4,		/* second SCSI			*/
+/*0x26*/5,		/* second SCSI			*/
 /*0x27*/11,		/* Floppy			*/
-/*0x28*/4,		/* Spare Hardware		*/
+/*0x28*/5,		/* Spare Hardware		*/
 /*0x29*/9,		/* Keyboard			*/
-/*0x2a*/4,		/* Mouse			*/
+/*0x2a*/5,		/* Mouse			*/
 /*0x2b*/12,		/* Serial			*/
 /*0x2c*/10,		/* Timer 0			*/
 /*0x2d*/11,		/* Timer 1			*/
@@ -313,11 +313,11 @@ static int psycho_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 
 	ret = psycho_pil_table[ino];
 	if (ret == 0 && pdev == NULL) {
-		ret = 4;
+		ret = 5;
 	} else if (ret == 0) {
 		switch ((pdev->class >> 16) & 0xff) {
 		case PCI_BASE_CLASS_STORAGE:
-			ret = 4;
+			ret = 5;
 			break;
 
 		case PCI_BASE_CLASS_NETWORK:
@@ -336,7 +336,7 @@ static int psycho_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 			break;
 
 		default:
-			ret = 4;
+			ret = 5;
 			break;
 		};
 	}
@@ -1164,7 +1164,7 @@ static void pbm_config_busmastering(struct pci_pbm_info *pbm)
 static void pbm_scan_bus(struct pci_controller_info *p,
 			 struct pci_pbm_info *pbm)
 {
-	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);
+	struct pcidev_cookie *cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
 
 	if (!cookie) {
 		prom_printf("PSYCHO: Critical allocation failure.\n");
@@ -1172,7 +1172,6 @@ static void pbm_scan_bus(struct pci_controller_info *p,
 	}
 
 	/* All we care about is the PBM. */
-	memset(cookie, 0, sizeof(*cookie));
 	cookie->pbm = pbm;
 
 	pbm->pci_bus = pci_scan_bus(pbm->pci_first_busno,
@@ -1465,18 +1464,16 @@ void psycho_init(int node, char *model_name)
 		}
 	}
 
-	p = kmalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
+	p = kzalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
 	if (!p) {
 		prom_printf("PSYCHO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(p, 0, sizeof(*p));
-	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	iommu = kzalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
 	if (!iommu) {
 		prom_printf("PSYCHO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(iommu, 0, sizeof(*iommu));
 	p->pbm_A.iommu = p->pbm_B.iommu = iommu;
 
 	p->next = pci_controller_root;
diff --git a/arch/sparc64/kernel/pci_sabre.c b/arch/sparc64/kernel/pci_sabre.c
index da8e1364194f..f67bb7f078cf 100644
--- a/arch/sparc64/kernel/pci_sabre.c
+++ b/arch/sparc64/kernel/pci_sabre.c
@@ -533,17 +533,17 @@ static unsigned char sabre_pil_table[] = {
 /*0x14*/0, 0, 0, 0,	/* PCI B slot 1  Int A, B, C, D */
 /*0x18*/0, 0, 0, 0,	/* PCI B slot 2  Int A, B, C, D */
 /*0x1c*/0, 0, 0, 0,	/* PCI B slot 3  Int A, B, C, D */
-/*0x20*/4,		/* SCSI				*/
+/*0x20*/5,		/* SCSI				*/
 /*0x21*/5,		/* Ethernet			*/
 /*0x22*/8,		/* Parallel Port		*/
 /*0x23*/13,		/* Audio Record			*/
 /*0x24*/14,		/* Audio Playback		*/
 /*0x25*/15,		/* PowerFail			*/
-/*0x26*/4,		/* second SCSI			*/
+/*0x26*/5,		/* second SCSI			*/
 /*0x27*/11,		/* Floppy			*/
-/*0x28*/4,		/* Spare Hardware		*/
+/*0x28*/5,		/* Spare Hardware		*/
 /*0x29*/9,		/* Keyboard			*/
-/*0x2a*/4,		/* Mouse			*/
+/*0x2a*/5,		/* Mouse			*/
 /*0x2b*/12,		/* Serial			*/
 /*0x2c*/10,		/* Timer 0			*/
 /*0x2d*/11,		/* Timer 1			*/
@@ -565,11 +565,11 @@ static int sabre_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 
 	ret = sabre_pil_table[ino];
 	if (ret == 0 && pdev == NULL) {
-		ret = 4;
+		ret = 5;
 	} else if (ret == 0) {
 		switch ((pdev->class >> 16) & 0xff) {
 		case PCI_BASE_CLASS_STORAGE:
-			ret = 4;
+			ret = 5;
 			break;
 
 		case PCI_BASE_CLASS_NETWORK:
@@ -588,7 +588,7 @@ static int sabre_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 			break;
 
 		default:
-			ret = 4;
+			ret = 5;
 			break;
 		};
 	}
@@ -1167,7 +1167,7 @@ static void apb_init(struct pci_controller_info *p, struct pci_bus *sabre_bus)
 
 static struct pcidev_cookie *alloc_bridge_cookie(struct pci_pbm_info *pbm)
 {
-	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);
+	struct pcidev_cookie *cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
 
 	if (!cookie) {
 		prom_printf("SABRE: Critical allocation failure.\n");
@@ -1175,7 +1175,6 @@ static struct pcidev_cookie *alloc_bridge_cookie(struct pci_pbm_info *pbm)
 	}
 
 	/* All we care about is the PBM. */
-	memset(cookie, 0, sizeof(*cookie));
 	cookie->pbm = pbm;
 
 	return cookie;
@@ -1556,19 +1555,17 @@ void sabre_init(int pnode, char *model_name)
 		}
 	}
 
-	p = kmalloc(sizeof(*p), GFP_ATOMIC);
+	p = kzalloc(sizeof(*p), GFP_ATOMIC);
 	if (!p) {
 		prom_printf("SABRE: Error, kmalloc(pci_controller_info) failed.\n");
 		prom_halt();
 	}
-	memset(p, 0, sizeof(*p));
 
-	iommu = kmalloc(sizeof(*iommu), GFP_ATOMIC);
+	iommu = kzalloc(sizeof(*iommu), GFP_ATOMIC);
 	if (!iommu) {
 		prom_printf("SABRE: Error, kmalloc(pci_iommu) failed.\n");
 		prom_halt();
 	}
-	memset(iommu, 0, sizeof(*iommu));
 	p->pbm_A.iommu = p->pbm_B.iommu = iommu;
 
 	upa_portid = prom_getintdefault(pnode, "upa-portid", 0xff);
diff --git a/arch/sparc64/kernel/pci_schizo.c b/arch/sparc64/kernel/pci_schizo.c
index d8c4e0919b4e..7fe4de03ac2e 100644
--- a/arch/sparc64/kernel/pci_schizo.c
+++ b/arch/sparc64/kernel/pci_schizo.c
@@ -243,8 +243,8 @@ static unsigned char schizo_pil_table[] = {
 /*0x0c*/0, 0, 0, 0,	/* PCI slot 3  Int A, B, C, D	*/
 /*0x10*/0, 0, 0, 0,	/* PCI slot 4  Int A, B, C, D	*/
 /*0x14*/0, 0, 0, 0,	/* PCI slot 5  Int A, B, C, D	*/
-/*0x18*/4,		/* SCSI				*/
-/*0x19*/4,		/* second SCSI			*/
+/*0x18*/5,		/* SCSI				*/
+/*0x19*/5,		/* second SCSI			*/
 /*0x1a*/0,		/* UNKNOWN			*/
 /*0x1b*/0,		/* UNKNOWN			*/
 /*0x1c*/8,		/* Parallel			*/
@@ -254,7 +254,7 @@ static unsigned char schizo_pil_table[] = {
 /*0x20*/13,		/* Audio Record			*/
 /*0x21*/14,		/* Audio Playback		*/
 /*0x22*/12,		/* Serial			*/
-/*0x23*/4,		/* EBUS I2C 			*/
+/*0x23*/5,		/* EBUS I2C 			*/
 /*0x24*/10,		/* RTC Clock			*/
 /*0x25*/11,		/* Floppy			*/
 /*0x26*/0,		/* UNKNOWN			*/
@@ -296,11 +296,11 @@ static int schizo_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 
 	ret = schizo_pil_table[ino];
 	if (ret == 0 && pdev == NULL) {
-		ret = 4;
+		ret = 5;
 	} else if (ret == 0) {
 		switch ((pdev->class >> 16) & 0xff) {
 		case PCI_BASE_CLASS_STORAGE:
-			ret = 4;
+			ret = 5;
 			break;
 
 		case PCI_BASE_CLASS_NETWORK:
@@ -319,7 +319,7 @@ static int schizo_ino_to_pil(struct pci_dev *pdev, unsigned int ino)
 			break;
 
 		default:
-			ret = 4;
+			ret = 5;
 			break;
 		};
 	}
@@ -1525,7 +1525,7 @@ static void pbm_config_busmastering(struct pci_pbm_info *pbm)
 static void pbm_scan_bus(struct pci_controller_info *p,
 			 struct pci_pbm_info *pbm)
 {
-	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);
+	struct pcidev_cookie *cookie = kzalloc(sizeof(*cookie), GFP_KERNEL);
 
 	if (!cookie) {
 		prom_printf("%s: Critical allocation failure.\n", pbm->name);
@@ -1533,7 +1533,6 @@ static void pbm_scan_bus(struct pci_controller_info *p,
 	}
 
 	/* All we care about is the PBM. */
-	memset(cookie, 0, sizeof(*cookie));
 	cookie->pbm = pbm;
 
 	pbm->pci_bus = pci_scan_bus(pbm->pci_first_busno,
@@ -2120,27 +2119,24 @@ static void __schizo_init(int node, char *model_name, int chip_type)
 		}
 	}
 
-	p = kmalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
+	p = kzalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
 	if (!p) {
 		prom_printf("SCHIZO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(p, 0, sizeof(*p));
 
-	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	iommu = kzalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
 	if (!iommu) {
 		prom_printf("SCHIZO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(iommu, 0, sizeof(*iommu));
 	p->pbm_A.iommu = iommu;
 
-	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	iommu = kzalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
 	if (!iommu) {
 		prom_printf("SCHIZO: Fatal memory allocation error.\n");
 		prom_halt();
 	}
-	memset(iommu, 0, sizeof(*iommu));
 	p->pbm_B.iommu = iommu;
 
 	p->next = pci_controller_root;
diff --git a/arch/sparc64/kernel/pci_sun4v.c b/arch/sparc64/kernel/pci_sun4v.c
new file mode 100644
index 000000000000..9372d4f376d5
--- /dev/null
+++ b/arch/sparc64/kernel/pci_sun4v.c
@@ -0,0 +1,1147 @@
+/* pci_sun4v.c: SUN4V specific PCI controller support.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+
+#include <asm/pbm.h>
+#include <asm/iommu.h>
+#include <asm/irq.h>
+#include <asm/upa.h>
+#include <asm/pstate.h>
+#include <asm/oplib.h>
+#include <asm/hypervisor.h>
+
+#include "pci_impl.h"
+#include "iommu_common.h"
+
+#include "pci_sun4v.h"
+
+#define PGLIST_NENTS	(PAGE_SIZE / sizeof(u64))
+
+struct pci_iommu_batch {
+	struct pci_dev	*pdev;		/* Device mapping is for.	*/
+	unsigned long	prot;		/* IOMMU page protections	*/
+	unsigned long	entry;		/* Index into IOTSB.		*/
+	u64		*pglist;	/* List of physical pages	*/
+	unsigned long	npages;		/* Number of pages in list.	*/
+};
+
+static DEFINE_PER_CPU(struct pci_iommu_batch, pci_iommu_batch);
+
+/* Interrupts must be disabled.  */
+static inline void pci_iommu_batch_start(struct pci_dev *pdev, unsigned long prot, unsigned long entry)
+{
+	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);
+
+	p->pdev		= pdev;
+	p->prot		= prot;
+	p->entry	= entry;
+	p->npages	= 0;
+}
+
+/* Interrupts must be disabled.  */
+static long pci_iommu_batch_flush(struct pci_iommu_batch *p)
+{
+	struct pcidev_cookie *pcp = p->pdev->sysdata;
+	unsigned long devhandle = pcp->pbm->devhandle;
+	unsigned long prot = p->prot;
+	unsigned long entry = p->entry;
+	u64 *pglist = p->pglist;
+	unsigned long npages = p->npages;
+
+	while (npages != 0) {
+		long num;
+
+		num = pci_sun4v_iommu_map(devhandle, HV_PCI_TSBID(0, entry),
+					  npages, prot, __pa(pglist));
+		if (unlikely(num < 0)) {
+			if (printk_ratelimit())
+				printk("pci_iommu_batch_flush: IOMMU map of "
+				       "[%08lx:%08lx:%lx:%lx:%lx] failed with "
+				       "status %ld\n",
+				       devhandle, HV_PCI_TSBID(0, entry),
+				       npages, prot, __pa(pglist), num);
+			return -1;
+		}
+
+		entry += num;
+		npages -= num;
+		pglist += num;
+	}
+
+	p->entry = entry;
+	p->npages = 0;
+
+	return 0;
+}
+
+/* Interrupts must be disabled.  */
+static inline long pci_iommu_batch_add(u64 phys_page)
+{
+	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);
+
+	BUG_ON(p->npages >= PGLIST_NENTS);
+
+	p->pglist[p->npages++] = phys_page;
+	if (p->npages == PGLIST_NENTS)
+		return pci_iommu_batch_flush(p);
+
+	return 0;
+}
+
+/* Interrupts must be disabled.  */
+static inline long pci_iommu_batch_end(void)
+{
+	struct pci_iommu_batch *p = &__get_cpu_var(pci_iommu_batch);
+
+	BUG_ON(p->npages >= PGLIST_NENTS);
+
+	return pci_iommu_batch_flush(p);
+}
+
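The three batch helpers above are driven in a fixed start/add/end sequence by the mapping routines later in this file; the per-cpu page list lets many IOTSB entries be handed to the hypervisor in one pci_sun4v_iommu_map() call rather than one call per page. A minimal sketch of that calling pattern follows; the helper name batch_map_contig() is made up for illustration, and interrupts are assumed to be disabled by the caller, as the comments above require.

	/* Hypothetical caller: map 'npages' contiguous IO pages starting at
	 * 'base_paddr' into the IOTSB at index 'entry'.  Mirrors what
	 * pci_4v_map_single() below does for a real mapping.
	 */
	static long batch_map_contig(struct pci_dev *pdev, unsigned long entry,
				     unsigned long base_paddr,
				     unsigned long npages, unsigned long prot)
	{
		unsigned long i;

		pci_iommu_batch_start(pdev, prot, entry);
		for (i = 0; i < npages; i++) {
			/* Flushes to the hypervisor automatically once
			 * PGLIST_NENTS entries have queued up.
			 */
			if (pci_iommu_batch_add(base_paddr + (i * IO_PAGE_SIZE)) < 0L)
				return -1;
		}
		/* Push out whatever remains in the per-cpu page list. */
		return pci_iommu_batch_end();
	}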
+static long pci_arena_alloc(struct pci_iommu_arena *arena, unsigned long npages)
+{
+	unsigned long n, i, start, end, limit;
+	int pass;
+
+	limit = arena->limit;
+	start = arena->hint;
+	pass = 0;
+
+again:
+	n = find_next_zero_bit(arena->map, limit, start);
+	end = n + npages;
+	if (unlikely(end >= limit)) {
+		if (likely(pass < 1)) {
+			limit = start;
+			start = 0;
+			pass++;
+			goto again;
+		} else {
+			/* Scanned the whole thing, give up. */
+			return -1;
+		}
+	}
+
+	for (i = n; i < end; i++) {
+		if (test_bit(i, arena->map)) {
+			start = i + 1;
+			goto again;
+		}
+	}
+
+	for (i = n; i < end; i++)
+		__set_bit(i, arena->map);
+
+	arena->hint = end;
+
+	return n;
+}
+
+static void pci_arena_free(struct pci_iommu_arena *arena, unsigned long base, unsigned long npages)
+{
+	unsigned long i;
+
+	for (i = base; i < (base + npages); i++)
+		__clear_bit(i, arena->map);
+}
+
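pci_arena_alloc()/pci_arena_free() implement a plain bitmap allocator over IOTSB entry numbers: the first scan runs from the rotating hint up to the limit, a second pass wraps back to zero, and only then does the allocation fail. An entry number becomes the device-visible DMA address via the same arithmetic the mapping routines below use; the tiny helper here only restates that relationship for clarity and is not part of the patch.

	/* Illustrative only: translate an arena entry (IOTSB index) into the
	 * bus/DMA address handed back to drivers.
	 */
	static inline dma_addr_t arena_entry_to_dma(struct pci_iommu *iommu,
						    long entry)
	{
		return iommu->page_table_map_base + (entry << IO_PAGE_SHIFT);
	}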
+static void *pci_4v_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_addrp)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, order, first_page, npages, n;
+	void *ret;
+	long entry;
+
+	size = IO_PAGE_ALIGN(size);
+	order = get_order(size);
+	if (unlikely(order >= MAX_ORDER))
+		return NULL;
+
+	npages = size >> IO_PAGE_SHIFT;
+
+	first_page = __get_free_pages(GFP_ATOMIC, order);
+	if (unlikely(first_page == 0UL))
+		return NULL;
+
+	memset((char *)first_page, 0, PAGE_SIZE << order);
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	entry = pci_arena_alloc(&iommu->arena, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	if (unlikely(entry < 0L))
+		goto arena_alloc_fail;
+
+	*dma_addrp = (iommu->page_table_map_base +
+		      (entry << IO_PAGE_SHIFT));
+	ret = (void *) first_page;
+	first_page = __pa(first_page);
+
+	local_irq_save(flags);
+
+	pci_iommu_batch_start(pdev,
+			      (HV_PCI_MAP_ATTR_READ |
+			       HV_PCI_MAP_ATTR_WRITE),
+			      entry);
+
+	for (n = 0; n < npages; n++) {
+		long err = pci_iommu_batch_add(first_page + (n * PAGE_SIZE));
+		if (unlikely(err < 0L))
+			goto iommu_map_fail;
+	}
+
+	if (unlikely(pci_iommu_batch_end() < 0L))
+		goto iommu_map_fail;
+
+	local_irq_restore(flags);
+
+	return ret;
+
+iommu_map_fail:
+	/* Interrupts are disabled.  */
+	spin_lock(&iommu->lock);
+	pci_arena_free(&iommu->arena, entry, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+arena_alloc_fail:
+	free_pages(first_page, order);
+	return NULL;
+}
+
+static void pci_4v_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_t dvma)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, order, npages, entry;
+	u32 devhandle;
+
+	npages = IO_PAGE_ALIGN(size) >> IO_PAGE_SHIFT;
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+	devhandle = pcp->pbm->devhandle;
+	entry = ((dvma - iommu->page_table_map_base) >> IO_PAGE_SHIFT);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	pci_arena_free(&iommu->arena, entry, npages);
+
+	do {
+		unsigned long num;
+
+		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
+					    npages);
+		entry += num;
+		npages -= num;
+	} while (npages != 0);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	order = get_order(size);
+	if (order < 10)
+		free_pages((unsigned long)cpu, order);
+}
+
+static dma_addr_t pci_4v_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direction)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, npages, oaddr;
+	unsigned long i, base_paddr;
+	u32 bus_addr, ret;
+	unsigned long prot;
+	long entry;
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+
+	if (unlikely(direction == PCI_DMA_NONE))
+		goto bad;
+
+	oaddr = (unsigned long)ptr;
+	npages = IO_PAGE_ALIGN(oaddr + sz) - (oaddr & IO_PAGE_MASK);
+	npages >>= IO_PAGE_SHIFT;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+	entry = pci_arena_alloc(&iommu->arena, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	if (unlikely(entry < 0L))
+		goto bad;
+
+	bus_addr = (iommu->page_table_map_base +
+		    (entry << IO_PAGE_SHIFT));
+	ret = bus_addr | (oaddr & ~IO_PAGE_MASK);
+	base_paddr = __pa(oaddr & IO_PAGE_MASK);
+	prot = HV_PCI_MAP_ATTR_READ;
+	if (direction != PCI_DMA_TODEVICE)
+		prot |= HV_PCI_MAP_ATTR_WRITE;
+
+	local_irq_save(flags);
+
+	pci_iommu_batch_start(pdev, prot, entry);
+
+	for (i = 0; i < npages; i++, base_paddr += IO_PAGE_SIZE) {
+		long err = pci_iommu_batch_add(base_paddr);
+		if (unlikely(err < 0L))
+			goto iommu_map_fail;
+	}
+	if (unlikely(pci_iommu_batch_end() < 0L))
+		goto iommu_map_fail;
+
+	local_irq_restore(flags);
+
+	return ret;
+
+bad:
+	if (printk_ratelimit())
+		WARN_ON(1);
+	return PCI_DMA_ERROR_CODE;
+
+iommu_map_fail:
+	/* Interrupts are disabled.  */
+	spin_lock(&iommu->lock);
+	pci_arena_free(&iommu->arena, entry, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return PCI_DMA_ERROR_CODE;
+}
+
+static void pci_4v_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, npages;
+	long entry;
+	u32 devhandle;
+
+	if (unlikely(direction == PCI_DMA_NONE)) {
+		if (printk_ratelimit())
+			WARN_ON(1);
+		return;
+	}
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+	devhandle = pcp->pbm->devhandle;
+
+	npages = IO_PAGE_ALIGN(bus_addr + sz) - (bus_addr & IO_PAGE_MASK);
+	npages >>= IO_PAGE_SHIFT;
+	bus_addr &= IO_PAGE_MASK;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	entry = (bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT;
+	pci_arena_free(&iommu->arena, entry, npages);
+
+	do {
+		unsigned long num;
+
+		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
+					    npages);
+		entry += num;
+		npages -= num;
+	} while (npages != 0);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+#define SG_ENT_PHYS_ADDRESS(SG)	\
+	(__pa(page_address((SG)->page)) + (SG)->offset)
+
+static inline long fill_sg(long entry, struct pci_dev *pdev,
+			   struct scatterlist *sg,
+			   int nused, int nelems, unsigned long prot)
+{
+	struct scatterlist *dma_sg = sg;
+	struct scatterlist *sg_end = sg + nelems;
+	unsigned long flags;
+	int i;
+
+	local_irq_save(flags);
+
+	pci_iommu_batch_start(pdev, prot, entry);
+
+	for (i = 0; i < nused; i++) {
+		unsigned long pteval = ~0UL;
+		u32 dma_npages;
+
+		dma_npages = ((dma_sg->dma_address & (IO_PAGE_SIZE - 1UL)) +
+			      dma_sg->dma_length +
+			      ((IO_PAGE_SIZE - 1UL))) >> IO_PAGE_SHIFT;
+		do {
+			unsigned long offset;
+			signed int len;
+
+			/* If we are here, we know we have at least one
+			 * more page to map.  So walk forward until we
+			 * hit a page crossing, and begin creating new
+			 * mappings from that spot.
+			 */
+			for (;;) {
+				unsigned long tmp;
+
+				tmp = SG_ENT_PHYS_ADDRESS(sg);
+				len = sg->length;
+				if (((tmp ^ pteval) >> IO_PAGE_SHIFT) != 0UL) {
+					pteval = tmp & IO_PAGE_MASK;
+					offset = tmp & (IO_PAGE_SIZE - 1UL);
+					break;
+				}
+				if (((tmp ^ (tmp + len - 1UL)) >> IO_PAGE_SHIFT) != 0UL) {
+					pteval = (tmp + IO_PAGE_SIZE) & IO_PAGE_MASK;
+					offset = 0UL;
+					len -= (IO_PAGE_SIZE - (tmp & (IO_PAGE_SIZE - 1UL)));
+					break;
+				}
+				sg++;
+			}
+
+			pteval = (pteval & IOPTE_PAGE);
+			while (len > 0) {
+				long err;
+
+				err = pci_iommu_batch_add(pteval);
+				if (unlikely(err < 0L))
+					goto iommu_map_failed;
+
+				pteval += IO_PAGE_SIZE;
+				len -= (IO_PAGE_SIZE - offset);
+				offset = 0;
+				dma_npages--;
+			}
+
+			pteval = (pteval & IOPTE_PAGE) + len;
+			sg++;
+
+			/* Skip over any tail mappings we've fully mapped,
+			 * adjusting pteval along the way.  Stop when we
+			 * detect a page crossing event.
+			 */
+			while (sg < sg_end &&
+			       (pteval << (64 - IO_PAGE_SHIFT)) != 0UL &&
+			       (pteval == SG_ENT_PHYS_ADDRESS(sg)) &&
+			       ((pteval ^
+				 (SG_ENT_PHYS_ADDRESS(sg) + sg->length - 1UL)) >> IO_PAGE_SHIFT) == 0UL) {
+				pteval += sg->length;
+				sg++;
+			}
+			if ((pteval << (64 - IO_PAGE_SHIFT)) == 0UL)
+				pteval = ~0UL;
+		} while (dma_npages != 0);
+		dma_sg++;
+	}
+
+	if (unlikely(pci_iommu_batch_end() < 0L))
+		goto iommu_map_failed;
+
+	local_irq_restore(flags);
+	return 0;
+
+iommu_map_failed:
+	local_irq_restore(flags);
+	return -1L;
+}
+
+static int pci_4v_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, npages, prot;
+	u32 dma_base;
+	struct scatterlist *sgtmp;
+	long entry, err;
+	int used;
+
+	/* Fast path single entry scatterlists. */
+	if (nelems == 1) {
+		sglist->dma_address =
+			pci_4v_map_single(pdev,
+					  (page_address(sglist->page) + sglist->offset),
+					  sglist->length, direction);
+		if (unlikely(sglist->dma_address == PCI_DMA_ERROR_CODE))
+			return 0;
+		sglist->dma_length = sglist->length;
+		return 1;
+	}
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+	
+	if (unlikely(direction == PCI_DMA_NONE))
+		goto bad;
+
+	/* Step 1: Prepare scatter list. */
+	npages = prepare_sg(sglist, nelems);
+
+	/* Step 2: Allocate a cluster and context, if necessary. */
+	spin_lock_irqsave(&iommu->lock, flags);
+	entry = pci_arena_alloc(&iommu->arena, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	if (unlikely(entry < 0L))
+		goto bad;
+
+	dma_base = iommu->page_table_map_base +
+		(entry << IO_PAGE_SHIFT);
+
+	/* Step 3: Normalize DMA addresses. */
+	used = nelems;
+
+	sgtmp = sglist;
+	while (used && sgtmp->dma_length) {
+		sgtmp->dma_address += dma_base;
+		sgtmp++;
+		used--;
+	}
+	used = nelems - used;
+
+	/* Step 4: Create the mappings. */
+	prot = HV_PCI_MAP_ATTR_READ;
+	if (direction != PCI_DMA_TODEVICE)
+		prot |= HV_PCI_MAP_ATTR_WRITE;
+
+	err = fill_sg(entry, pdev, sglist, used, nelems, prot);
+	if (unlikely(err < 0L))
+		goto iommu_map_failed;
+
+	return used;
+
+bad:
+	if (printk_ratelimit())
+		WARN_ON(1);
+	return 0;
+
+iommu_map_failed:
+	spin_lock_irqsave(&iommu->lock, flags);
+	pci_arena_free(&iommu->arena, entry, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	return 0;
+}
+
+static void pci_4v_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+{
+	struct pcidev_cookie *pcp;
+	struct pci_iommu *iommu;
+	unsigned long flags, i, npages;
+	long entry;
+	u32 devhandle, bus_addr;
+
+	if (unlikely(direction == PCI_DMA_NONE)) {
+		if (printk_ratelimit())
+			WARN_ON(1);
+	}
+
+	pcp = pdev->sysdata;
+	iommu = pcp->pbm->iommu;
+	devhandle = pcp->pbm->devhandle;
+	
+	bus_addr = sglist->dma_address & IO_PAGE_MASK;
+
+	for (i = 1; i < nelems; i++)
+		if (sglist[i].dma_length == 0)
+			break;
+	i--;
+	npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) -
+		  bus_addr) >> IO_PAGE_SHIFT;
+
+	entry = ((bus_addr - iommu->page_table_map_base) >> IO_PAGE_SHIFT);
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	pci_arena_free(&iommu->arena, entry, npages);
+
+	do {
+		unsigned long num;
+
+		num = pci_sun4v_iommu_demap(devhandle, HV_PCI_TSBID(0, entry),
+					    npages);
+		entry += num;
+		npages -= num;
+	} while (npages != 0);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+static void pci_4v_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int direction)
+{
+	/* Nothing to do... */
+}
+
+static void pci_4v_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int direction)
+{
+	/* Nothing to do... */
+}
+
+struct pci_iommu_ops pci_sun4v_iommu_ops = {
+	.alloc_consistent		= pci_4v_alloc_consistent,
+	.free_consistent		= pci_4v_free_consistent,
+	.map_single			= pci_4v_map_single,
+	.unmap_single			= pci_4v_unmap_single,
+	.map_sg				= pci_4v_map_sg,
+	.unmap_sg			= pci_4v_unmap_sg,
+	.dma_sync_single_for_cpu	= pci_4v_dma_sync_single_for_cpu,
+	.dma_sync_sg_for_cpu		= pci_4v_dma_sync_sg_for_cpu,
+};
+
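pci_sun4v_iommu_ops fills in the same operation table that the pci_iommu.c hunk earlier in this patch installs as pci_sun4u_iommu_ops, so the DMA API entry points can stay controller-agnostic. The generic dispatch side is not part of this diff; the sketch below only illustrates the presumed indirection, and the global pointer name pci_iommu_ops is an assumption for illustration.

	/* Hedged sketch of the generic side (not shown in this diff): the
	 * platform selects one ops table at boot and the pci_* DMA helpers
	 * simply dispatch through it.
	 */
	extern struct pci_iommu_ops *pci_iommu_ops;	/* assumed global */

	static inline dma_addr_t generic_pci_map_single(struct pci_dev *pdev,
							void *ptr, size_t sz,
							int direction)
	{
		return pci_iommu_ops->map_single(pdev, ptr, sz, direction);
	}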
+/* SUN4V PCI configuration space accessors. */
+
+static inline int pci_sun4v_out_of_range(struct pci_pbm_info *pbm, unsigned int bus, unsigned int device, unsigned int func)
+{
+	if (bus == pbm->pci_first_busno) {
+		if (device == 0 && func == 0)
+			return 0;
+		return 1;
+	}
+
+	if (bus < pbm->pci_first_busno ||
+	    bus > pbm->pci_last_busno)
+		return 1;
+	return 0;
+}
+
+static int pci_sun4v_read_pci_cfg(struct pci_bus *bus_dev, unsigned int devfn,
+				  int where, int size, u32 *value)
+{
+	struct pci_pbm_info *pbm = bus_dev->sysdata;
+	u32 devhandle = pbm->devhandle;
+	unsigned int bus = bus_dev->number;
+	unsigned int device = PCI_SLOT(devfn);
+	unsigned int func = PCI_FUNC(devfn);
+	unsigned long ret;
+
+	if (pci_sun4v_out_of_range(pbm, bus, device, func)) {
+		ret = ~0UL;
+	} else {
+		ret = pci_sun4v_config_get(devhandle,
+				HV_PCI_DEVICE_BUILD(bus, device, func),
+				where, size);
+#if 0
+		printk("rcfg: [%x:%x:%x:%d]=[%lx]\n",
+		       devhandle, HV_PCI_DEVICE_BUILD(bus, device, func),
+		       where, size, ret);
+#endif
+	}
+	switch (size) {
+	case 1:
+		*value = ret & 0xff;
+		break;
+	case 2:
+		*value = ret & 0xffff;
+		break;
+	case 4:
+		*value = ret & 0xffffffff;
+		break;
+	};
+
+
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static int pci_sun4v_write_pci_cfg(struct pci_bus *bus_dev, unsigned int devfn,
+				   int where, int size, u32 value)
+{
+	struct pci_pbm_info *pbm = bus_dev->sysdata;
+	u32 devhandle = pbm->devhandle;
+	unsigned int bus = bus_dev->number;
+	unsigned int device = PCI_SLOT(devfn);
+	unsigned int func = PCI_FUNC(devfn);
+	unsigned long ret;
+
+	if (pci_sun4v_out_of_range(pbm, bus, device, func)) {
+		/* Do nothing. */
+	} else {
+		ret = pci_sun4v_config_put(devhandle,
+				HV_PCI_DEVICE_BUILD(bus, device, func),
+				where, size, value);
+#if 0
+		printk("wcfg: [%x:%x:%x:%d] v[%x] == [%lx]\n",
+		       devhandle, HV_PCI_DEVICE_BUILD(bus, device, func),
+		       where, size, value, ret);
+#endif
+	}
+	return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops pci_sun4v_ops = {
+	.read =		pci_sun4v_read_pci_cfg,
+	.write =	pci_sun4v_write_pci_cfg,
+};
+
+
+static void pbm_scan_bus(struct pci_controller_info *p,
+			 struct pci_pbm_info *pbm)
+{
+	struct pcidev_cookie *cookie = kmalloc(sizeof(*cookie), GFP_KERNEL);
+
+	if (!cookie) {
+		prom_printf("%s: Critical allocation failure.\n", pbm->name);
+		prom_halt();
+	}
+
+	/* All we care about is the PBM. */
+	memset(cookie, 0, sizeof(*cookie));
+	cookie->pbm = pbm;
+
+	pbm->pci_bus = pci_scan_bus(pbm->pci_first_busno, p->pci_ops, pbm);
+#if 0
+	pci_fixup_host_bridge_self(pbm->pci_bus);
+	pbm->pci_bus->self->sysdata = cookie;
+#endif
+	pci_fill_in_pbm_cookies(pbm->pci_bus, pbm,
+				pbm->prom_node);
+	pci_record_assignments(pbm, pbm->pci_bus);
+	pci_assign_unassigned(pbm, pbm->pci_bus);
+	pci_fixup_irq(pbm, pbm->pci_bus);
+	pci_determine_66mhz_disposition(pbm, pbm->pci_bus);
+	pci_setup_busmastering(pbm, pbm->pci_bus);
+}
+
+static void pci_sun4v_scan_bus(struct pci_controller_info *p)
+{
+	if (p->pbm_A.prom_node) {
+		p->pbm_A.is_66mhz_capable =
+			prom_getbool(p->pbm_A.prom_node, "66mhz-capable");
+
+		pbm_scan_bus(p, &p->pbm_A);
+	}
+	if (p->pbm_B.prom_node) {
+		p->pbm_B.is_66mhz_capable =
+			prom_getbool(p->pbm_B.prom_node, "66mhz-capable");
+
+		pbm_scan_bus(p, &p->pbm_B);
+	}
+
+	/* XXX register error interrupt handlers XXX */
+}
+
+static unsigned int pci_sun4v_irq_build(struct pci_pbm_info *pbm,
+					struct pci_dev *pdev,
+					unsigned int devino)
+{
+	u32 devhandle = pbm->devhandle;
+	int pil;
+
+	pil = 5;
+	if (pdev) {
+		switch ((pdev->class >> 16) & 0xff) {
+		case PCI_BASE_CLASS_STORAGE:
+			pil = 5;
+			break;
+
+		case PCI_BASE_CLASS_NETWORK:
+			pil = 6;
+			break;
+
+		case PCI_BASE_CLASS_DISPLAY:
+			pil = 9;
+			break;
+
+		case PCI_BASE_CLASS_MULTIMEDIA:
+		case PCI_BASE_CLASS_MEMORY:
+		case PCI_BASE_CLASS_BRIDGE:
+		case PCI_BASE_CLASS_SERIAL:
+			pil = 10;
+			break;
+
+		default:
+			pil = 5;
+			break;
+		};
+	}
+	BUG_ON(PIL_RESERVED(pil));
+
+	return sun4v_build_irq(devhandle, devino, pil, IBF_PCI);
+}
+
+static void pci_sun4v_base_address_update(struct pci_dev *pdev, int resource)
+{
+	struct pcidev_cookie *pcp = pdev->sysdata;
+	struct pci_pbm_info *pbm = pcp->pbm;
+	struct resource *res, *root;
+	u32 reg;
+	int where, size, is_64bit;
+
+	res = &pdev->resource[resource];
+	if (resource < 6) {
+		where = PCI_BASE_ADDRESS_0 + (resource * 4);
+	} else if (resource == PCI_ROM_RESOURCE) {
+		where = pdev->rom_base_reg;
+	} else {
+		/* Somebody might have asked for allocation of a non-standard resource */
+		return;
+	}
+
+	/* XXX 64-bit MEM handling is not 100% correct... XXX */
+	is_64bit = 0;
+	if (res->flags & IORESOURCE_IO)
+		root = &pbm->io_space;
+	else {
+		root = &pbm->mem_space;
+		if ((res->flags & PCI_BASE_ADDRESS_MEM_TYPE_MASK)
+		    == PCI_BASE_ADDRESS_MEM_TYPE_64)
+			is_64bit = 1;
+	}
+
+	size = res->end - res->start;
+	pci_read_config_dword(pdev, where, &reg);
+	reg = ((reg & size) |
+	       (((u32)(res->start - root->start)) & ~size));
+	if (resource == PCI_ROM_RESOURCE) {
+		reg |= PCI_ROM_ADDRESS_ENABLE;
+		res->flags |= IORESOURCE_ROM_ENABLE;
+	}
+	pci_write_config_dword(pdev, where, reg);
+
+	/* This knows that the upper 32-bits of the address
+	 * must be zero.  Our PCI common layer enforces this.
+	 */
+	if (is_64bit)
+		pci_write_config_dword(pdev, where + 4, 0);
+}
+
+static void pci_sun4v_resource_adjust(struct pci_dev *pdev,
+				      struct resource *res,
+				      struct resource *root)
+{
+	res->start += root->start;
+	res->end += root->start;
+}
+
+/* Use ranges property to determine where PCI MEM, I/O, and Config
+ * space are for this PCI bus module.
+ */
+static void pci_sun4v_determine_mem_io_space(struct pci_pbm_info *pbm)
+{
+	int i, saw_mem, saw_io;
+
+	saw_mem = saw_io = 0;
+	for (i = 0; i < pbm->num_pbm_ranges; i++) {
+		struct linux_prom_pci_ranges *pr = &pbm->pbm_ranges[i];
+		unsigned long a;
+		int type;
+
+		type = (pr->child_phys_hi >> 24) & 0x3;
+		a = (((unsigned long)pr->parent_phys_hi << 32UL) |
+		     ((unsigned long)pr->parent_phys_lo  <<  0UL));
+
+		switch (type) {
+		case 1:
+			/* 16-bit IO space, 16MB */
+			pbm->io_space.start = a;
+			pbm->io_space.end = a + ((16UL*1024UL*1024UL) - 1UL);
+			pbm->io_space.flags = IORESOURCE_IO;
+			saw_io = 1;
+			break;
+
+		case 2:
+			/* 32-bit MEM space, 2GB */
+			pbm->mem_space.start = a;
+			pbm->mem_space.end = a + (0x80000000UL - 1UL);
+			pbm->mem_space.flags = IORESOURCE_MEM;
+			saw_mem = 1;
+			break;
+
+		case 3:
+			/* XXX 64-bit MEM handling XXX */
+
+		default:
+			break;
+		};
+	}
+
+	if (!saw_io || !saw_mem) {
+		prom_printf("%s: Fatal error, missing %s PBM range.\n",
+			    pbm->name,
+			    (!saw_io ? "IO" : "MEM"));
+		prom_halt();
+	}
+
+	printk("%s: PCI IO[%lx] MEM[%lx]\n",
+	       pbm->name,
+	       pbm->io_space.start,
+	       pbm->mem_space.start);
+}
+
+static void pbm_register_toplevel_resources(struct pci_controller_info *p,
+					    struct pci_pbm_info *pbm)
+{
+	pbm->io_space.name = pbm->mem_space.name = pbm->name;
+
+	request_resource(&ioport_resource, &pbm->io_space);
+	request_resource(&iomem_resource, &pbm->mem_space);
+	pci_register_legacy_regions(&pbm->io_space,
+				    &pbm->mem_space);
+}
+
+static unsigned long probe_existing_entries(struct pci_pbm_info *pbm,
+					    struct pci_iommu *iommu)
+{
+	struct pci_iommu_arena *arena = &iommu->arena;
+	unsigned long i, cnt = 0;
+	u32 devhandle;
+
+	devhandle = pbm->devhandle;
+	for (i = 0; i < arena->limit; i++) {
+		unsigned long ret, io_attrs, ra;
+
+		ret = pci_sun4v_iommu_getmap(devhandle,
+					     HV_PCI_TSBID(0, i),
+					     &io_attrs, &ra);
+		if (ret == HV_EOK) {
+			cnt++;
+			__set_bit(i, arena->map);
+		}
+	}
+
+	return cnt;
+}
+
+static void pci_sun4v_iommu_init(struct pci_pbm_info *pbm)
+{
+	struct pci_iommu *iommu = pbm->iommu;
+	unsigned long num_tsb_entries, sz;
+	u32 vdma[2], dma_mask, dma_offset;
+	int err, tsbsize;
+
+	err = prom_getproperty(pbm->prom_node, "virtual-dma",
+			       (char *)&vdma[0], sizeof(vdma));
+	if (err == 0 || err == -1) {
+		/* No property, use default values. */
+		vdma[0] = 0x80000000;
+		vdma[1] = 0x80000000;
+	}
+
+	dma_mask = vdma[0];
+	switch (vdma[1]) {
+		case 0x20000000:
+			dma_mask |= 0x1fffffff;
+			tsbsize = 64;
+			break;
+
+		case 0x40000000:
+			dma_mask |= 0x3fffffff;
+			tsbsize = 128;
+			break;
+
+		case 0x80000000:
+			dma_mask |= 0x7fffffff;
+			tsbsize = 256;
+			break;
+
+		default:
+			prom_printf("PCI-SUN4V: strange virtual-dma size.\n");
+			prom_halt();
+	};
+
+	tsbsize *= (8 * 1024);
+
+	num_tsb_entries = tsbsize / sizeof(iopte_t);
+
+	dma_offset = vdma[0];
+
+	/* Setup initial software IOMMU state. */
+	spin_lock_init(&iommu->lock);
+	iommu->ctx_lowest_free = 1;
+	iommu->page_table_map_base = dma_offset;
+	iommu->dma_addr_mask = dma_mask;
+
+	/* Allocate and initialize the free area map.  */
+	sz = num_tsb_entries / 8;
+	sz = (sz + 7UL) & ~7UL;
+	iommu->arena.map = kmalloc(sz, GFP_KERNEL);
+	if (!iommu->arena.map) {
+		prom_printf("PCI_IOMMU: Error, kmalloc(arena.map) failed.\n");
+		prom_halt();
+	}
+	memset(iommu->arena.map, 0, sz);
+	iommu->arena.limit = num_tsb_entries;
+
+	sz = probe_existing_entries(pbm, iommu);
+
+	printk("%s: TSB entries [%lu], existing mappings [%lu]\n",
+	       pbm->name, num_tsb_entries, sz);
+}
+
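As a sanity check on the sizing logic above, the default 2GB "virtual-dma" window (vdma[1] == 0x80000000) works out as follows; the numbers assume the usual sparc64 values of an 8-byte iopte_t and an 8KB IO page.

	/* tsbsize          = 256 * 8 * 1024               = 2 MB of IOTSB
	 * num_tsb_entries  = 2 MB / sizeof(iopte_t) (8 B) = 262144 entries
	 * DMA coverage     = 262144 * IO_PAGE_SIZE (8 KB) = 2 GB
	 * which is consistent with dma_mask = vdma[0] | 0x7fffffff above.
	 */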
+static void pci_sun4v_get_bus_range(struct pci_pbm_info *pbm)
+{
+	unsigned int busrange[2];
+	int prom_node = pbm->prom_node;
+	int err;
+
+	err = prom_getproperty(prom_node, "bus-range",
+			       (char *)&busrange[0],
+			       sizeof(busrange));
+	if (err == 0 || err == -1) {
+		prom_printf("%s: Fatal error, no bus-range.\n", pbm->name);
+		prom_halt();
+	}
+
+	pbm->pci_first_busno = busrange[0];
+	pbm->pci_last_busno = busrange[1];
+
+}
+
+static void pci_sun4v_pbm_init(struct pci_controller_info *p, int prom_node, u32 devhandle)
+{
+	struct pci_pbm_info *pbm;
+	int err, i;
+
+	if (devhandle & 0x40)
+		pbm = &p->pbm_B;
+	else
+		pbm = &p->pbm_A;
+
+	pbm->parent = p;
+	pbm->prom_node = prom_node;
+	pbm->pci_first_slot = 1;
+
+	pbm->devhandle = devhandle;
+
+	sprintf(pbm->name, "SUN4V-PCI%d PBM%c",
+		p->index, (pbm == &p->pbm_A ? 'A' : 'B'));
+
+	printk("%s: devhandle[%x] prom_node[%x:%x]\n",
+	       pbm->name, pbm->devhandle,
+	       pbm->prom_node, prom_getchild(pbm->prom_node));
+
+	prom_getstring(prom_node, "name",
+		       pbm->prom_name, sizeof(pbm->prom_name));
+
+	err = prom_getproperty(prom_node, "ranges",
+			       (char *) pbm->pbm_ranges,
+			       sizeof(pbm->pbm_ranges));
+	if (err == 0 || err == -1) {
+		prom_printf("%s: Fatal error, no ranges property.\n",
+			    pbm->name);
+		prom_halt();
+	}
+
+	pbm->num_pbm_ranges =
+		(err / sizeof(struct linux_prom_pci_ranges));
+
+	/* Mask out the top 8 bits of the ranges, leaving the real
+	 * physical address.
+	 */
+	for (i = 0; i < pbm->num_pbm_ranges; i++)
+		pbm->pbm_ranges[i].parent_phys_hi &= 0x0fffffff;
+
+	pci_sun4v_determine_mem_io_space(pbm);
+	pbm_register_toplevel_resources(p, pbm);
+
+	err = prom_getproperty(prom_node, "interrupt-map",
+			       (char *)pbm->pbm_intmap,
+			       sizeof(pbm->pbm_intmap));
+	if (err == 0 || err == -1) {
+		prom_printf("%s: Fatal error, no interrupt-map property.\n",
+			    pbm->name);
+		prom_halt();
+	}
+
+	pbm->num_pbm_intmap = (err / sizeof(struct linux_prom_pci_intmap));
+	err = prom_getproperty(prom_node, "interrupt-map-mask",
+			       (char *)&pbm->pbm_intmask,
+			       sizeof(pbm->pbm_intmask));
+	if (err == 0 || err == -1) {
+		prom_printf("%s: Fatal error, no interrupt-map-mask.\n",
+			    pbm->name);
+		prom_halt();
+	}
+
+	pci_sun4v_get_bus_range(pbm);
+	pci_sun4v_iommu_init(pbm);
+}
+
+void sun4v_pci_init(int node, char *model_name)
+{
+	struct pci_controller_info *p;
+	struct pci_iommu *iommu;
+	struct linux_prom64_registers regs;
+	u32 devhandle;
+	int i;
+
+	prom_getproperty(node, "reg", (char *)&regs, sizeof(regs));
+	devhandle = (regs.phys_addr >> 32UL) & 0x0fffffff;
+
+	for (p = pci_controller_root; p; p = p->next) {
+		struct pci_pbm_info *pbm;
+
+		if (p->pbm_A.prom_node && p->pbm_B.prom_node)
+			continue;
+
+		pbm = (p->pbm_A.prom_node ?
+		       &p->pbm_A :
+		       &p->pbm_B);
+
+		if (pbm->devhandle == (devhandle ^ 0x40)) {
+			pci_sun4v_pbm_init(p, node, devhandle);
+			return;
+		}
+	}
+
+	for_each_cpu(i) {
+		unsigned long page = get_zeroed_page(GFP_ATOMIC);
+
+		if (!page)
+			goto fatal_memory_error;
+
+		per_cpu(pci_iommu_batch, i).pglist = (u64 *) page;
+	}
+
+	p = kmalloc(sizeof(struct pci_controller_info), GFP_ATOMIC);
+	if (!p)
+		goto fatal_memory_error;
+
+	memset(p, 0, sizeof(*p));
+
+	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	if (!iommu)
+		goto fatal_memory_error;
+
+	memset(iommu, 0, sizeof(*iommu));
+	p->pbm_A.iommu = iommu;
+
+	iommu = kmalloc(sizeof(struct pci_iommu), GFP_ATOMIC);
+	if (!iommu)
+		goto fatal_memory_error;
+
+	memset(iommu, 0, sizeof(*iommu));
+	p->pbm_B.iommu = iommu;
+
+	p->next = pci_controller_root;
+	pci_controller_root = p;
+
+	p->index = pci_num_controllers++;
+	p->pbms_same_domain = 0;
+
+	p->scan_bus = pci_sun4v_scan_bus;
+	p->irq_build = pci_sun4v_irq_build;
+	p->base_address_update = pci_sun4v_base_address_update;
+	p->resource_adjust = pci_sun4v_resource_adjust;
+	p->pci_ops = &pci_sun4v_ops;
+
+	/* Like PSYCHO and SCHIZO we have a 2GB aligned area
+	 * for memory space.
+	 */
+	pci_memspace_mask = 0x7fffffffUL;
+
+	pci_sun4v_pbm_init(p, node, devhandle);
+	return;
+
+fatal_memory_error:
+	prom_printf("SUN4V_PCI: Fatal memory allocation error.\n");
+	prom_halt();
+}
diff --git a/arch/sparc64/kernel/pci_sun4v.h b/arch/sparc64/kernel/pci_sun4v.h
new file mode 100644
index 000000000000..884d25f6158d
--- /dev/null
+++ b/arch/sparc64/kernel/pci_sun4v.h
@@ -0,0 +1,31 @@
+/* pci_sun4v.h: SUN4V specific PCI controller support.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#ifndef _PCI_SUN4V_H
+#define _PCI_SUN4V_H
+
+extern long pci_sun4v_iommu_map(unsigned long devhandle,
+				unsigned long tsbid,
+				unsigned long num_ttes,
+				unsigned long io_attributes,
+				unsigned long io_page_list_pa);
+extern unsigned long pci_sun4v_iommu_demap(unsigned long devhandle,
+					   unsigned long tsbid,
+					   unsigned long num_ttes);
+extern unsigned long pci_sun4v_iommu_getmap(unsigned long devhandle,
+					    unsigned long tsbid,
+					    unsigned long *io_attributes,
+					    unsigned long *real_address);
+extern unsigned long pci_sun4v_config_get(unsigned long devhandle,
+					  unsigned long pci_device,
+					  unsigned long config_offset,
+					  unsigned long size);
+extern int pci_sun4v_config_put(unsigned long devhandle,
+				unsigned long pci_device,
+				unsigned long config_offset,
+				unsigned long size,
+				unsigned long data);
+
+#endif /* !(_PCI_SUN4V_H) */
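The accessors declared here wrap hypervisor fast traps (implemented in pci_sun4v_asm.S just below) and are consumed by the pci_ops read/write methods in pci_sun4v.c. A small usage sketch follows; the devhandle and bus/device/function values are placeholders.

	/* Illustrative only: read the 32-bit vendor/device ID word of a
	 * function, mirroring pci_sun4v_read_pci_cfg().  A failed access
	 * comes back as all 1's.
	 */
	static u32 sun4v_read_vendor_device(unsigned long devhandle,
					    unsigned int bus, unsigned int dev,
					    unsigned int fn)
	{
		unsigned long v = pci_sun4v_config_get(devhandle,
					HV_PCI_DEVICE_BUILD(bus, dev, fn),
					PCI_VENDOR_ID, 4);
		return (u32) v;
	}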
diff --git a/arch/sparc64/kernel/pci_sun4v_asm.S b/arch/sparc64/kernel/pci_sun4v_asm.S
new file mode 100644
index 000000000000..6604fdbf746c
--- /dev/null
+++ b/arch/sparc64/kernel/pci_sun4v_asm.S
@@ -0,0 +1,95 @@
+/* pci_sun4v_asm: Hypervisor calls for PCI support.
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#include <asm/hypervisor.h>
+
+	/* %o0: devhandle
+	 * %o1:	tsbid
+	 * %o2:	num ttes
+	 * %o3:	io_attributes
+	 * %o4:	io_page_list phys address
+	 *
+	 * returns %o0:	-status if status was non-zero, else
+	 *         %o0:	num pages mapped
+	 */
+	.globl	pci_sun4v_iommu_map
+pci_sun4v_iommu_map:
+	mov	%o5, %g1
+	mov	HV_FAST_PCI_IOMMU_MAP, %o5
+	ta	HV_FAST_TRAP
+	brnz,pn %o0, 1f
+	 sub	%g0, %o0, %o0
+	mov	%o1, %o0
+1:	retl
+	 nop
+
+	/* %o0: devhandle
+	 * %o1:	tsbid
+	 * %o2:	num ttes
+	 *
+	 * returns %o0:	num ttes demapped
+	 */
+	.globl	pci_sun4v_iommu_demap
+pci_sun4v_iommu_demap:
+	mov	HV_FAST_PCI_IOMMU_DEMAP, %o5
+	ta	HV_FAST_TRAP
+	retl
+	 mov	%o1, %o0
+
+	/* %o0: devhandle
+	 * %o1:	tsbid
+	 * %o2:	&io_attributes
+	 * %o3:	&real_address
+	 *
+	 * returns %o0:	status
+	 */
+	.globl	pci_sun4v_iommu_getmap
+pci_sun4v_iommu_getmap:
+	mov	%o2, %o4
+	mov	HV_FAST_PCI_IOMMU_GETMAP, %o5
+	ta	HV_FAST_TRAP
+	stx	%o1, [%o4]
+	stx	%o2, [%o3]
+	retl
+	 mov	%o0, %o0
+
+	/* %o0: devhandle
+	 * %o1:	pci_device
+	 * %o2:	pci_config_offset
+	 * %o3:	size
+	 *
+	 * returns %o0:	data
+	 *
+	 * If there is an error, the data will be returned
+	 * as all 1's.
+	 */
+	.globl	pci_sun4v_config_get
+pci_sun4v_config_get:
+	mov	HV_FAST_PCI_CONFIG_GET, %o5
+	ta	HV_FAST_TRAP
+	brnz,a,pn %o1, 1f
+	 mov	-1, %o2
+1:	retl
+	 mov	%o2, %o0
+
+	/* %o0: devhandle
+	 * %o1:	pci_device
+	 * %o2:	pci_config_offset
+	 * %o3:	size
+	 * %o4:	data
+	 *
+	 * returns %o0:	status
+	 *
+	 * status will be zero if the operation completed
+	 * successfully, else -1 if not
+	 */
+	.globl	pci_sun4v_config_put
+pci_sun4v_config_put:
+	mov	HV_FAST_PCI_CONFIG_PUT, %o5
+	ta	HV_FAST_TRAP
+	brnz,a,pn %o1, 1f
+	 mov	-1, %o1
+1:	retl
+	 mov	%o1, %o0
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 059b0d025224..1c7ca2f712d9 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -44,83 +44,61 @@
 #include <asm/fpumacro.h>
 #include <asm/head.h>
 #include <asm/cpudata.h>
+#include <asm/mmu_context.h>
 #include <asm/unistd.h>
+#include <asm/hypervisor.h>
 
 /* #define VERBOSE_SHOWREGS */
 
-/*
- * Nothing special yet...
- */
-void default_idle(void)
-{
-}
-
-#ifndef CONFIG_SMP
-
-/*
- * the idle loop on a Sparc... ;)
- */
-void cpu_idle(void)
+static void sparc64_yield(void)
 {
-	/* endless idle loop with no priority at all */
-	for (;;) {
-		/* If current->work.need_resched is zero we should really
-		 * setup for a system wakup event and execute a shutdown
-		 * instruction.
-		 *
-		 * But this requires writing back the contents of the
-		 * L2 cache etc. so implement this later. -DaveM
-		 */
-		while (!need_resched())
-			barrier();
+	if (tlb_type != hypervisor)
+		return;
 
-		preempt_enable_no_resched();
-		schedule();
-		preempt_disable();
-		check_pgt_cache();
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+	smp_mb__after_clear_bit();
+
+	while (!need_resched()) {
+		unsigned long pstate;
+
+		/* Disable interrupts. */
+		__asm__ __volatile__(
+			"rdpr %%pstate, %0\n\t"
+			"andn %0, %1, %0\n\t"
+			"wrpr %0, %%g0, %%pstate"
+			: "=&r" (pstate)
+			: "i" (PSTATE_IE));
+
+		if (!need_resched())
+			sun4v_cpu_yield();
+
+		/* Re-enable interrupts. */
+		__asm__ __volatile__(
+			"rdpr %%pstate, %0\n\t"
+			"or %0, %1, %0\n\t"
+			"wrpr %0, %%g0, %%pstate"
+			: "=&r" (pstate)
+			: "i" (PSTATE_IE));
 	}
-}
 
-#else
+	set_thread_flag(TIF_POLLING_NRFLAG);
+}
 
-/*
- * the idle loop on a UltraMultiPenguin...
- *
- * TIF_POLLING_NRFLAG is set because we do not sleep the cpu
- * inside of the idler task, so an interrupt is not needed
- * to get a clean fast response.
- *
- * XXX Reverify this assumption... -DaveM
- *
- * Addendum: We do want it to do something for the signal
- *           delivery case, we detect that by just seeing
- *           if we are trying to send this to an idler or not.
- */
+/* The idle loop on sparc64. */
 void cpu_idle(void)
 {
-	cpuinfo_sparc *cpuinfo = &local_cpu_data();
 	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	while(1) {
 		if (need_resched()) {
-			cpuinfo->idle_volume = 0;
 			preempt_enable_no_resched();
 			schedule();
 			preempt_disable();
-			check_pgt_cache();
 		}
-		cpuinfo->idle_volume++;
-
-		/* The store ordering is so that IRQ handlers on
-		 * other cpus see our increasing idleness for the buddy
-		 * redistribution algorithm.  -DaveM
-		 */
-		membar_storeload_storestore();
+		sparc64_yield();
 	}
 }
 
-#endif
-
 extern char reboot_command [];
 
 extern void (*prom_palette)(int);
@@ -354,6 +332,7 @@ void show_regs(struct pt_regs *regs)
 	extern long etrap, etraptl1;
 #endif
 	__show_regs(regs);
+#if 0
 #ifdef CONFIG_SMP
 	{
 		extern void smp_report_regs(void);
@@ -361,6 +340,7 @@ void show_regs(struct pt_regs *regs)
 		smp_report_regs();
 	}
 #endif
+#endif
 
 #ifdef VERBOSE_SHOWREGS	
 	if (regs->tpc >= &etrap && regs->tpc < &etraptl1 &&
@@ -433,30 +413,15 @@ void exit_thread(void)
 void flush_thread(void)
 {
 	struct thread_info *t = current_thread_info();
+	struct mm_struct *mm;
 
 	if (t->flags & _TIF_ABI_PENDING)
 		t->flags ^= (_TIF_ABI_PENDING | _TIF_32BIT);
 
-	if (t->task->mm) {
-		unsigned long pgd_cache = 0UL;
-		if (test_thread_flag(TIF_32BIT)) {
-			struct mm_struct *mm = t->task->mm;
-			pgd_t *pgd0 = &mm->pgd[0];
-			pud_t *pud0 = pud_offset(pgd0, 0);
+	mm = t->task->mm;
+	if (mm)
+		tsb_context_switch(mm);
 
-			if (pud_none(*pud0)) {
-				pmd_t *page = pmd_alloc_one(mm, 0);
-				pud_set(pud0, page);
-			}
-			pgd_cache = get_pgd_cache(pgd0);
-		}
-		__asm__ __volatile__("stxa %0, [%1] %2\n\t"
-				     "membar #Sync"
-				     : /* no outputs */
-				     : "r" (pgd_cache),
-				     "r" (TSB_REG),
-				     "i" (ASI_DMMU));
-	}
 	set_thread_wsaved(0);
 
 	/* Turn off performance counters if on. */
@@ -555,6 +520,18 @@ void synchronize_user_stack(void)
 	}
 }
 
+static void stack_unaligned(unsigned long sp)
+{
+	siginfo_t info;
+
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = BUS_ADRALN;
+	info.si_addr = (void __user *) sp;
+	info.si_trapno = 0;
+	force_sig_info(SIGBUS, &info, current);
+}
+
 void fault_in_user_windows(void)
 {
 	struct thread_info *t = current_thread_info();
@@ -570,13 +547,17 @@ void fault_in_user_windows(void)
 	flush_user_windows();
 	window = get_thread_wsaved();
 
-	if (window != 0) {
+	if (likely(window != 0)) {
 		window -= 1;
 		do {
 			unsigned long sp = (t->rwbuf_stkptrs[window] + bias);
 			struct reg_window *rwin = &t->reg_window[window];
 
-			if (copy_to_user((char __user *)sp, rwin, winsize))
+			if (unlikely(sp & 0x7UL))
+				stack_unaligned(sp);
+
+			if (unlikely(copy_to_user((char __user *)sp,
+						  rwin, winsize)))
 				goto barf;
 		} while (window--);
 	}
diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c
index 3f9746f856d2..eb93e9c52846 100644
--- a/arch/sparc64/kernel/ptrace.c
+++ b/arch/sparc64/kernel/ptrace.c
@@ -124,6 +124,9 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
 {
 	BUG_ON(len > PAGE_SIZE);
 
+	if (tlb_type == hypervisor)
+		return;
+
 #ifdef DCACHE_ALIASING_POSSIBLE
 	/* If bit 13 of the kernel address we used to access the
 	 * user page is the same as the virtual address that page
diff --git a/arch/sparc64/kernel/rtrap.S b/arch/sparc64/kernel/rtrap.S
index b80eba0081ca..7130e866f935 100644
--- a/arch/sparc64/kernel/rtrap.S
+++ b/arch/sparc64/kernel/rtrap.S
@@ -223,12 +223,26 @@ rt_continue:	ldx			[%sp + PTREGS_OFF + PT_V9_G1], %g1
 		ldx			[%sp + PTREGS_OFF + PT_V9_G3], %g3
 		ldx			[%sp + PTREGS_OFF + PT_V9_G4], %g4
 		ldx			[%sp + PTREGS_OFF + PT_V9_G5], %g5
-		mov			TSB_REG, %g6
-		brnz,a,pn		%l3, 1f
-		 ldxa			[%g6] ASI_IMMU, %g5
-1:		ldx			[%sp + PTREGS_OFF + PT_V9_G6], %g6
+		brz,pt			%l3, 1f
+		mov			%g6, %l2
+
+		/* Must do this before thread reg is clobbered below.  */
+		LOAD_PER_CPU_BASE(%g5, %g6, %i0, %i1, %i2)
+1:
+		ldx			[%sp + PTREGS_OFF + PT_V9_G6], %g6
 		ldx			[%sp + PTREGS_OFF + PT_V9_G7], %g7
-		wrpr			%g0, RTRAP_PSTATE_AG_IRQOFF, %pstate
+
+		/* Normal globals are restored, go to trap globals.  */
+661:		wrpr			%g0, RTRAP_PSTATE_AG_IRQOFF, %pstate
+		nop
+		.section		.sun4v_2insn_patch, "ax"
+		.word			661b
+		wrpr			%g0, RTRAP_PSTATE_IRQOFF, %pstate
+		SET_GL(1)
+		.previous
+
+		mov			%l2, %g6
+
 		ldx			[%sp + PTREGS_OFF + PT_V9_I0], %i0
 		ldx			[%sp + PTREGS_OFF + PT_V9_I1], %i1
 
@@ -252,27 +266,108 @@ rt_continue:	ldx			[%sp + PTREGS_OFF + PT_V9_G1], %g1
 
 		brnz,pn			%l3, kern_rtt
 		 mov			PRIMARY_CONTEXT, %l7
-		ldxa			[%l7 + %l7] ASI_DMMU, %l0
+
+661:		ldxa			[%l7 + %l7] ASI_DMMU, %l0
+		.section		.sun4v_1insn_patch, "ax"
+		.word			661b
+		ldxa			[%l7 + %l7] ASI_MMU, %l0
+		.previous
+
 		sethi			%hi(sparc64_kern_pri_nuc_bits), %l1
 		ldx			[%l1 + %lo(sparc64_kern_pri_nuc_bits)], %l1
 		or			%l0, %l1, %l0
-		stxa			%l0, [%l7] ASI_DMMU
-		flush			%g6
+
+661:		stxa			%l0, [%l7] ASI_DMMU
+		.section		.sun4v_1insn_patch, "ax"
+		.word			661b
+		stxa			%l0, [%l7] ASI_MMU
+		.previous
+
+		sethi			%hi(KERNBASE), %l7
+		flush			%l7
 		rdpr			%wstate, %l1
 		rdpr			%otherwin, %l2
 		srl			%l1, 3, %l1
 
 		wrpr			%l2, %g0, %canrestore
 		wrpr			%l1, %g0, %wstate
-		wrpr			%g0, %g0, %otherwin
+		brnz,pt			%l2, user_rtt_restore
+		 wrpr			%g0, %g0, %otherwin
+
+		ldx			[%g6 + TI_FLAGS], %g3
+		wr			%g0, ASI_AIUP, %asi
+		rdpr			%cwp, %g1
+		andcc			%g3, _TIF_32BIT, %g0
+		sub			%g1, 1, %g1
+		bne,pt			%xcc, user_rtt_fill_32bit
+		 wrpr			%g1, %cwp
+		ba,a,pt			%xcc, user_rtt_fill_64bit
+
+user_rtt_fill_fixup:
+		rdpr	%cwp, %g1
+		add	%g1, 1, %g1
+		wrpr	%g1, 0x0, %cwp
+
+		rdpr	%wstate, %g2
+		sll	%g2, 3, %g2
+		wrpr	%g2, 0x0, %wstate
+
+		/* We know %canrestore and %otherwin are both zero.  */
+
+		sethi	%hi(sparc64_kern_pri_context), %g2
+		ldx	[%g2 + %lo(sparc64_kern_pri_context)], %g2
+		mov	PRIMARY_CONTEXT, %g1
+
+661:		stxa	%g2, [%g1] ASI_DMMU
+		.section .sun4v_1insn_patch, "ax"
+		.word	661b
+		stxa	%g2, [%g1] ASI_MMU
+		.previous
+
+		sethi	%hi(KERNBASE), %g1
+		flush	%g1
+
+		or	%g4, FAULT_CODE_WINFIXUP, %g4
+		stb	%g4, [%g6 + TI_FAULT_CODE]
+		stx	%g5, [%g6 + TI_FAULT_ADDR]
+
+		mov	%g6, %l1
+		wrpr	%g0, 0x0, %tl
+
+661:		nop
+		.section		.sun4v_1insn_patch, "ax"
+		.word			661b
+		SET_GL(0)
+		.previous
+
+		wrpr	%g0, RTRAP_PSTATE, %pstate
+
+		mov	%l1, %g6
+		ldx	[%g6 + TI_TASK], %g4
+		LOAD_PER_CPU_BASE(%g5, %g6, %g1, %g2, %g3)
+		call	do_sparc64_fault
+		 add	%sp, PTREGS_OFF, %o0
+		ba,pt	%xcc, rtrap
+		 nop
+
+user_rtt_pre_restore:
+		add			%g1, 1, %g1
+		wrpr			%g1, 0x0, %cwp
+
+user_rtt_restore:
 		restore
 		rdpr			%canrestore, %g1
 		wrpr			%g1, 0x0, %cleanwin
 		retry
 		nop
 
-kern_rtt:	restore
+kern_rtt:	rdpr			%canrestore, %g1
+		brz,pn			%g1, kern_rtt_fill
+		 nop
+kern_rtt_restore:
+		restore
 		retry
+
 to_kernel:
 #ifdef CONFIG_PREEMPT
 		ldsw			[%g6 + TI_PRE_COUNT], %l5
diff --git a/arch/sparc64/kernel/sbus.c b/arch/sparc64/kernel/sbus.c
index d95a1bcf163d..1d6ffdeabd4c 100644
--- a/arch/sparc64/kernel/sbus.c
+++ b/arch/sparc64/kernel/sbus.c
@@ -693,11 +693,11 @@ void sbus_set_sbus64(struct sbus_dev *sdev, int bursts)
 
 /* SBUS SYSIO INO number to Sparc PIL level. */
 static unsigned char sysio_ino_to_pil[] = {
-	0, 4, 4, 7, 5, 7, 8, 9,		/* SBUS slot 0 */
-	0, 4, 4, 7, 5, 7, 8, 9,		/* SBUS slot 1 */
-	0, 4, 4, 7, 5, 7, 8, 9,		/* SBUS slot 2 */
-	0, 4, 4, 7, 5, 7, 8, 9,		/* SBUS slot 3 */
-	4, /* Onboard SCSI */
+	0, 5, 5, 7, 5, 7, 8, 9,		/* SBUS slot 0 */
+	0, 5, 5, 7, 5, 7, 8, 9,		/* SBUS slot 1 */
+	0, 5, 5, 7, 5, 7, 8, 9,		/* SBUS slot 2 */
+	0, 5, 5, 7, 5, 7, 8, 9,		/* SBUS slot 3 */
+	5, /* Onboard SCSI */
 	5, /* Onboard Ethernet */
 /*XXX*/	8, /* Onboard BPP */
 	0, /* Bogon */
diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c
index 158bd31e15b7..7d0e67c1ce50 100644
--- a/arch/sparc64/kernel/setup.c
+++ b/arch/sparc64/kernel/setup.c
@@ -64,12 +64,6 @@ struct screen_info screen_info = {
 	16                      /* orig-video-points */
 };
 
-/* Typing sync at the prom prompt calls the function pointed to by
- * the sync callback which I set to the following function.
- * This should sync all filesystems and return, for now it just
- * prints out pretty messages and returns.
- */
-
 void (*prom_palette)(int);
 void (*prom_keyboard)(void);
 
@@ -79,259 +73,6 @@ prom_console_write(struct console *con, const char *s, unsigned n)
 	prom_write(s, n);
 }
 
-static struct console prom_console = {
-	.name =		"prom",
-	.write =	prom_console_write,
-	.flags =	CON_CONSDEV | CON_ENABLED,
-	.index =	-1,
-};
-
-#define PROM_TRUE	-1
-#define PROM_FALSE	0
-
-/* Pretty sick eh? */
-int prom_callback(long *args)
-{
-	struct console *cons, *saved_console = NULL;
-	unsigned long flags;
-	char *cmd;
-	extern spinlock_t prom_entry_lock;
-
-	if (!args)
-		return -1;
-	if (!(cmd = (char *)args[0]))
-		return -1;
-
-	/*
-	 * The callback can be invoked on the cpu that first dropped 
-	 * into prom_cmdline after taking the serial interrupt, or on 
-	 * a slave processor that was smp_captured() if the 
-	 * administrator has done a switch-cpu inside obp. In either 
-	 * case, the cpu is marked as in-interrupt. Drop IRQ locks.
-	 */
-	irq_exit();
-
-	/* XXX Revisit the locking here someday.  This is a debugging
-	 * XXX feature so it isnt all that critical.  -DaveM
-	 */
-	local_irq_save(flags);
-
-	spin_unlock(&prom_entry_lock);
-	cons = console_drivers;
-	while (cons) {
-		unregister_console(cons);
-		cons->flags &= ~(CON_PRINTBUFFER);
-		cons->next = saved_console;
-		saved_console = cons;
-		cons = console_drivers;
-	}
-	register_console(&prom_console);
-	if (!strcmp(cmd, "sync")) {
-		prom_printf("PROM `%s' command...\n", cmd);
-		show_free_areas();
-		if (current->pid != 0) {
-			local_irq_enable();
-			sys_sync();
-			local_irq_disable();
-		}
-		args[2] = 0;
-		args[args[1] + 3] = -1;
-		prom_printf("Returning to PROM\n");
-	} else if (!strcmp(cmd, "va>tte-data")) {
-		unsigned long ctx, va;
-		unsigned long tte = 0;
-		long res = PROM_FALSE;
-
-		ctx = args[3];
-		va = args[4];
-		if (ctx) {
-			/*
-			 * Find process owning ctx, lookup mapping.
-			 */
-			struct task_struct *p;
-			struct mm_struct *mm = NULL;
-			pgd_t *pgdp;
-			pud_t *pudp;
-			pmd_t *pmdp;
-			pte_t *ptep;
-			pte_t pte;
-
-			for_each_process(p) {
-				mm = p->mm;
-				if (CTX_NRBITS(mm->context) == ctx)
-					break;
-			}
-			if (!mm ||
-			    CTX_NRBITS(mm->context) != ctx)
-				goto done;
-
-			pgdp = pgd_offset(mm, va);
-			if (pgd_none(*pgdp))
-				goto done;
-			pudp = pud_offset(pgdp, va);
-			if (pud_none(*pudp))
-				goto done;
-			pmdp = pmd_offset(pudp, va);
-			if (pmd_none(*pmdp))
-				goto done;
-
-			/* Preemption implicitly disabled by virtue of
-			 * being called from inside OBP.
-			 */
-			ptep = pte_offset_map(pmdp, va);
-			pte = *ptep;
-			if (pte_present(pte)) {
-				tte = pte_val(pte);
-				res = PROM_TRUE;
-			}
-			pte_unmap(ptep);
-			goto done;
-		}
-
-		if ((va >= KERNBASE) && (va < (KERNBASE + (4 * 1024 * 1024)))) {
-			extern unsigned long sparc64_kern_pri_context;
-
-			/* Spitfire Errata #32 workaround */
-			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-					     "flush	%%g6"
-					     : /* No outputs */
-					     : "r" (sparc64_kern_pri_context),
-					       "r" (PRIMARY_CONTEXT),
-					       "i" (ASI_DMMU));
-
-			/*
-			 * Locked down tlb entry.
-			 */
-
-			if (tlb_type == spitfire)
-				tte = spitfire_get_dtlb_data(SPITFIRE_HIGHEST_LOCKED_TLBENT);
-			else if (tlb_type == cheetah || tlb_type == cheetah_plus)
-				tte = cheetah_get_ldtlb_data(CHEETAH_HIGHEST_LOCKED_TLBENT);
-
-			res = PROM_TRUE;
-			goto done;
-		}
-
-		if (va < PGDIR_SIZE) {
-			/*
-			 * vmalloc or prom_inherited mapping.
-			 */
-			pgd_t *pgdp;
-			pud_t *pudp;
-			pmd_t *pmdp;
-			pte_t *ptep;
-			pte_t pte;
-			int error;
-
-			if ((va >= LOW_OBP_ADDRESS) && (va < HI_OBP_ADDRESS)) {
-				tte = prom_virt_to_phys(va, &error);
-				if (!error)
-					res = PROM_TRUE;
-				goto done;
-			}
-			pgdp = pgd_offset_k(va);
-			if (pgd_none(*pgdp))
-				goto done;
-			pudp = pud_offset(pgdp, va);
-			if (pud_none(*pudp))
-				goto done;
-			pmdp = pmd_offset(pudp, va);
-			if (pmd_none(*pmdp))
-				goto done;
-
-			/* Preemption implicitly disabled by virtue of
-			 * being called from inside OBP.
-			 */
-			ptep = pte_offset_kernel(pmdp, va);
-			pte = *ptep;
-			if (pte_present(pte)) {
-				tte = pte_val(pte);
-				res = PROM_TRUE;
-			}
-			goto done;
-		}
-
-		if (va < PAGE_OFFSET) {
-			/*
-			 * No mappings here.
-			 */
-			goto done;
-		}
-
-		if (va & (1UL << 40)) {
-			/*
-			 * I/O page.
-			 */
-
-			tte = (__pa(va) & _PAGE_PADDR) |
-			      _PAGE_VALID | _PAGE_SZ4MB |
-			      _PAGE_E | _PAGE_P | _PAGE_W;
-			res = PROM_TRUE;
-			goto done;
-		}
-
-		/*
-		 * Normal page.
-		 */
-		tte = (__pa(va) & _PAGE_PADDR) |
-		      _PAGE_VALID | _PAGE_SZ4MB |
-		      _PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W;
-		res = PROM_TRUE;
-
-	done:
-		if (res == PROM_TRUE) {
-			args[2] = 3;
-			args[args[1] + 3] = 0;
-			args[args[1] + 4] = res;
-			args[args[1] + 5] = tte;
-		} else {
-			args[2] = 2;
-			args[args[1] + 3] = 0;
-			args[args[1] + 4] = res;
-		}
-	} else if (!strcmp(cmd, ".soft1")) {
-		unsigned long tte;
-
-		tte = args[3];
-		prom_printf("%lx:\"%s%s%s%s%s\" ",
-			    (tte & _PAGE_SOFT) >> 7,
-			    tte & _PAGE_MODIFIED ? "M" : "-",
-			    tte & _PAGE_ACCESSED ? "A" : "-",
-			    tte & _PAGE_READ     ? "W" : "-",
-			    tte & _PAGE_WRITE    ? "R" : "-",
-			    tte & _PAGE_PRESENT  ? "P" : "-");
-
-		args[2] = 2;
-		args[args[1] + 3] = 0;
-		args[args[1] + 4] = PROM_TRUE;
-	} else if (!strcmp(cmd, ".soft2")) {
-		unsigned long tte;
-
-		tte = args[3];
-		prom_printf("%lx ", (tte & 0x07FC000000000000UL) >> 50);
-
-		args[2] = 2;
-		args[args[1] + 3] = 0;
-		args[args[1] + 4] = PROM_TRUE;
-	} else {
-		prom_printf("unknown PROM `%s' command...\n", cmd);
-	}
-	unregister_console(&prom_console);
-	while (saved_console) {
-		cons = saved_console;
-		saved_console = cons->next;
-		register_console(cons);
-	}
-	spin_lock(&prom_entry_lock);
-	local_irq_restore(flags);
-
-	/*
-	 * Restore in-interrupt status for a resume from obp.
-	 */
-	irq_enter();
-	return 0;
-}
-
 unsigned int boot_flags = 0;
 #define BOOTME_DEBUG  0x1
 #define BOOTME_SINGLE 0x2
@@ -479,15 +220,99 @@ char reboot_command[COMMAND_LINE_SIZE];
 
 static struct pt_regs fake_swapper_regs = { { 0, }, 0, 0, 0, 0 };
 
-void register_prom_callbacks(void)
+static void __init per_cpu_patch(void)
 {
-	prom_setcallback(prom_callback);
-	prom_feval(": linux-va>tte-data 2 \" va>tte-data\" $callback drop ; "
-		   "' linux-va>tte-data to va>tte-data");
-	prom_feval(": linux-.soft1 1 \" .soft1\" $callback 2drop ; "
-		   "' linux-.soft1 to .soft1");
-	prom_feval(": linux-.soft2 1 \" .soft2\" $callback 2drop ; "
-		   "' linux-.soft2 to .soft2");
+	struct cpuid_patch_entry *p;
+	unsigned long ver;
+	int is_jbus;
+
+	if (tlb_type == spitfire && !this_is_starfire)
+		return;
+
+	is_jbus = 0;
+	if (tlb_type != hypervisor) {
+		__asm__ ("rdpr %%ver, %0" : "=r" (ver));
+		is_jbus = ((ver >> 32UL) == __JALAPENO_ID ||
+			   (ver >> 32UL) == __SERRANO_ID);
+	}
+
+	p = &__cpuid_patch;
+	while (p < &__cpuid_patch_end) {
+		unsigned long addr = p->addr;
+		unsigned int *insns;
+
+		switch (tlb_type) {
+		case spitfire:
+			insns = &p->starfire[0];
+			break;
+		case cheetah:
+		case cheetah_plus:
+			if (is_jbus)
+				insns = &p->cheetah_jbus[0];
+			else
+				insns = &p->cheetah_safari[0];
+			break;
+		case hypervisor:
+			insns = &p->sun4v[0];
+			break;
+		default:
+			prom_printf("Unknown cpu type, halting.\n");
+			prom_halt();
+		};
+
+		*(unsigned int *) (addr +  0) = insns[0];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  0));
+
+		*(unsigned int *) (addr +  4) = insns[1];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  4));
+
+		*(unsigned int *) (addr +  8) = insns[2];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  8));
+
+		*(unsigned int *) (addr + 12) = insns[3];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr + 12));
+
+		p++;
+	}
+}
+
+static void __init sun4v_patch(void)
+{
+	struct sun4v_1insn_patch_entry *p1;
+	struct sun4v_2insn_patch_entry *p2;
+
+	if (tlb_type != hypervisor)
+		return;
+
+	p1 = &__sun4v_1insn_patch;
+	while (p1 < &__sun4v_1insn_patch_end) {
+		unsigned long addr = p1->addr;
+
+		*(unsigned int *) (addr +  0) = p1->insn;
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  0));
+
+		p1++;
+	}
+
+	p2 = &__sun4v_2insn_patch;
+	while (p2 < &__sun4v_2insn_patch_end) {
+		unsigned long addr = p2->addr;
+
+		*(unsigned int *) (addr +  0) = p2->insns[0];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  0));
+
+		*(unsigned int *) (addr +  4) = p2->insns[1];
+		wmb();
+		__asm__ __volatile__("flush	%0" : : "r" (addr +  4));
+
+		p2++;
+	}
 }
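Both per_cpu_patch() and sun4v_patch() above repeat the same store/wmb()/flush sequence for every instruction word they rewrite.  A minimal C sketch of that idiom, for readers skimming the hunk (the helper name is invented here; the patch itself open-codes the sequence as shown):

	/* Overwrite one instruction word in the kernel image and make the
	 * change visible to instruction fetch.  Sketch only.
	 */
	static void __init patch_one_insn(unsigned long addr, unsigned int insn)
	{
		*(unsigned int *) addr = insn;	/* store the new opcode */
		wmb();				/* order the store before the flush */
		__asm__ __volatile__("flush	%0" : : "r" (addr));
	}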
 
 void __init setup_arch(char **cmdline_p)
@@ -496,7 +321,10 @@ void __init setup_arch(char **cmdline_p)
 	*cmdline_p = prom_getbootargs();
 	strcpy(saved_command_line, *cmdline_p);
 
-	printk("ARCH: SUN4U\n");
+	if (tlb_type == hypervisor)
+		printk("ARCH: SUN4V\n");
+	else
+		printk("ARCH: SUN4U\n");
 
 #ifdef CONFIG_DUMMY_CONSOLE
 	conswitchp = &dummy_con;
@@ -507,6 +335,13 @@ void __init setup_arch(char **cmdline_p)
 	/* Work out if we are starfire early on */
 	check_if_starfire();
 
+	/* Now we know enough to patch the get_cpuid sequences
+	 * used by trap code.
+	 */
+	per_cpu_patch();
+
+	sun4v_patch();
+
 	boot_flags_init(*cmdline_p);
 
 	idprom_init();
@@ -514,7 +349,7 @@ void __init setup_arch(char **cmdline_p)
 	if (!root_flags)
 		root_mountflags &= ~MS_RDONLY;
 	ROOT_DEV = old_decode_dev(root_dev);
-#ifdef CONFIG_BLK_DEV_INITRD
+#ifdef CONFIG_BLK_DEV_RAM
 	rd_image_start = ram_flags & RAMDISK_IMAGE_START_MASK;
 	rd_prompt = ((ram_flags & RAMDISK_PROMPT_FLAG) != 0);
 	rd_doload = ((ram_flags & RAMDISK_LOAD_FLAG) != 0);	
@@ -544,6 +379,9 @@ void __init setup_arch(char **cmdline_p)
 
 	smp_setup_cpu_possible_map();
 
+	/* Get boot processor trap_block[] setup.  */
+	init_cur_cpu_trap(current_thread_info());
+
 	paging_init();
 }
 
@@ -565,6 +403,12 @@ static int __init set_preferred_console(void)
 		serial_console = 2;
 	} else if (idev == PROMDEV_IRSC && odev == PROMDEV_ORSC) {
 		serial_console = 3;
+	} else if (idev == PROMDEV_IVCONS && odev == PROMDEV_OVCONS) {
+		/* sunhv_console_init() doesn't check the serial_console
+		 * value anyway...
+		 */
+		serial_console = 4;
+		return add_preferred_console("ttyHV", 0, NULL);
 	} else {
 		prom_printf("Inconsistent console: "
 			    "input %d, output %d\n",
@@ -598,9 +442,8 @@ static int show_cpuinfo(struct seq_file *m, void *__unused)
 	seq_printf(m, 
 		   "cpu\t\t: %s\n"
 		   "fpu\t\t: %s\n"
-		   "promlib\t\t: Version 3 Revision %d\n"
-		   "prom\t\t: %d.%d.%d\n"
-		   "type\t\t: sun4u\n"
+		   "prom\t\t: %s\n"
+		   "type\t\t: %s\n"
 		   "ncpus probed\t: %d\n"
 		   "ncpus active\t: %d\n"
 		   "D$ parity tl1\t: %u\n"
@@ -612,10 +455,10 @@ static int show_cpuinfo(struct seq_file *m, void *__unused)
 		   ,
 		   sparc_cpu_type,
 		   sparc_fpu_type,
-		   prom_rev,
-		   prom_prev >> 16,
-		   (prom_prev >> 8) & 0xff,
-		   prom_prev & 0xff,
+		   prom_version,
+		   ((tlb_type == hypervisor) ?
+		    "sun4v" :
+		    "sun4u"),
 		   ncpus_probed,
 		   num_online_cpus(),
 		   dcache_parity_tl1_occurred,
@@ -692,15 +535,11 @@ static int __init topology_init(void)
 	while (!cpu_find_by_instance(ncpus_probed, NULL, NULL))
 		ncpus_probed++;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_possible(i)) {
-			struct cpu *p = kmalloc(sizeof(*p), GFP_KERNEL);
-
-			if (p) {
-				memset(p, 0, sizeof(*p));
-				register_cpu(p, i, NULL);
-				err = 0;
-			}
+	for_each_cpu(i) {
+		struct cpu *p = kzalloc(sizeof(*p), GFP_KERNEL);
+		if (p) {
+			register_cpu(p, i, NULL);
+			err = 0;
 		}
 	}
 
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 1f7ad8a69052..7dc28a484268 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -38,6 +38,7 @@
 #include <asm/timer.h>
 #include <asm/starfire.h>
 #include <asm/tlb.h>
+#include <asm/sections.h>
 
 extern void calibrate_delay(void);
 
@@ -46,6 +47,8 @@ static unsigned char boot_cpu_id;
 
 cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
 cpumask_t phys_cpu_present_map __read_mostly = CPU_MASK_NONE;
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly =
+	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 static cpumask_t smp_commenced_mask;
 static cpumask_t cpu_callout_map;
 
@@ -54,30 +57,26 @@ void smp_info(struct seq_file *m)
 	int i;
 	
 	seq_printf(m, "State:\n");
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i))
-			seq_printf(m,
-				   "CPU%d:\t\tonline\n", i);
-	}
+	for_each_online_cpu(i)
+		seq_printf(m, "CPU%d:\t\tonline\n", i);
 }
 
 void smp_bogo(struct seq_file *m)
 {
 	int i;
 	
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_online(i))
-			seq_printf(m,
-				   "Cpu%dBogo\t: %lu.%02lu\n"
-				   "Cpu%dClkTck\t: %016lx\n",
-				   i, cpu_data(i).udelay_val / (500000/HZ),
-				   (cpu_data(i).udelay_val / (5000/HZ)) % 100,
-				   i, cpu_data(i).clock_tick);
+	for_each_online_cpu(i)
+		seq_printf(m,
+			   "Cpu%dBogo\t: %lu.%02lu\n"
+			   "Cpu%dClkTck\t: %016lx\n",
+			   i, cpu_data(i).udelay_val / (500000/HZ),
+			   (cpu_data(i).udelay_val / (5000/HZ)) % 100,
+			   i, cpu_data(i).clock_tick);
 }
 
 void __init smp_store_cpu_info(int id)
 {
-	int cpu_node;
+	int cpu_node, def;
 
 	/* multiplier and counter set by
 	   smp_setup_percpu_timer()  */
@@ -87,24 +86,32 @@ void __init smp_store_cpu_info(int id)
 	cpu_data(id).clock_tick = prom_getintdefault(cpu_node,
 						     "clock-frequency", 0);
 
-	cpu_data(id).pgcache_size		= 0;
-	cpu_data(id).pte_cache[0]		= NULL;
-	cpu_data(id).pte_cache[1]		= NULL;
-	cpu_data(id).pgd_cache			= NULL;
-	cpu_data(id).idle_volume		= 1;
-
+	def = ((tlb_type == hypervisor) ? (8 * 1024) : (16 * 1024));
 	cpu_data(id).dcache_size = prom_getintdefault(cpu_node, "dcache-size",
-						      16 * 1024);
+						      def);
+
+	def = 32;
 	cpu_data(id).dcache_line_size =
-		prom_getintdefault(cpu_node, "dcache-line-size", 32);
+		prom_getintdefault(cpu_node, "dcache-line-size", def);
+
+	def = 16 * 1024;
 	cpu_data(id).icache_size = prom_getintdefault(cpu_node, "icache-size",
-						      16 * 1024);
+						      def);
+
+	def = 32;
 	cpu_data(id).icache_line_size =
-		prom_getintdefault(cpu_node, "icache-line-size", 32);
+		prom_getintdefault(cpu_node, "icache-line-size", def);
+
+	def = ((tlb_type == hypervisor) ?
+	       (3 * 1024 * 1024) :
+	       (4 * 1024 * 1024));
 	cpu_data(id).ecache_size = prom_getintdefault(cpu_node, "ecache-size",
-						      4 * 1024 * 1024);
+						      def);
+
+	def = 64;
 	cpu_data(id).ecache_line_size =
-		prom_getintdefault(cpu_node, "ecache-line-size", 64);
+		prom_getintdefault(cpu_node, "ecache-line-size", def);
+
 	printk("CPU[%d]: Caches "
 	       "D[sz(%d):line_sz(%d)] "
 	       "I[sz(%d):line_sz(%d)] "
@@ -119,27 +126,16 @@ static void smp_setup_percpu_timer(void);
 
 static volatile unsigned long callin_flag = 0;
 
-extern void inherit_locked_prom_mappings(int save_p);
-
-static inline void cpu_setup_percpu_base(unsigned long cpu_id)
-{
-	__asm__ __volatile__("mov	%0, %%g5\n\t"
-			     "stxa	%0, [%1] %2\n\t"
-			     "membar	#Sync"
-			     : /* no outputs */
-			     : "r" (__per_cpu_offset(cpu_id)),
-			       "r" (TSB_REG), "i" (ASI_IMMU));
-}
-
 void __init smp_callin(void)
 {
 	int cpuid = hard_smp_processor_id();
 
-	inherit_locked_prom_mappings(0);
+	__local_per_cpu_offset = __per_cpu_offset(cpuid);
 
-	__flush_tlb_all();
+	if (tlb_type == hypervisor)
+		sun4v_ktsb_register();
 
-	cpu_setup_percpu_base(cpuid);
+	__flush_tlb_all();
 
 	smp_setup_percpu_timer();
 
@@ -316,6 +312,8 @@ static void smp_synchronize_one_tick(int cpu)
 	spin_unlock_irqrestore(&itc_sync_lock, flags);
 }
 
+extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load);
+
 extern unsigned long sparc64_cpu_startup;
 
 /* The OBP cpu startup callback truncates the 3rd arg cookie to
@@ -331,21 +329,31 @@ static int __devinit smp_boot_one_cpu(unsigned int cpu)
 	unsigned long cookie =
 		(unsigned long)(&cpu_new_thread);
 	struct task_struct *p;
-	int timeout, ret, cpu_node;
+	int timeout, ret;
 
 	p = fork_idle(cpu);
 	callin_flag = 0;
 	cpu_new_thread = task_thread_info(p);
 	cpu_set(cpu, cpu_callout_map);
 
-	cpu_find_by_mid(cpu, &cpu_node);
-	prom_startcpu(cpu_node, entry, cookie);
+	if (tlb_type == hypervisor) {
+		/* Alloc the mondo queues, cpu will load them.  */
+		sun4v_init_mondo_queues(0, cpu, 1, 0);
+
+		prom_startcpu_cpuid(cpu, entry, cookie);
+	} else {
+		int cpu_node;
+
+		cpu_find_by_mid(cpu, &cpu_node);
+		prom_startcpu(cpu_node, entry, cookie);
+	}
 
 	for (timeout = 0; timeout < 5000000; timeout++) {
 		if (callin_flag)
 			break;
 		udelay(100);
 	}
+
 	if (callin_flag) {
 		ret = 0;
 	} else {
@@ -441,7 +449,7 @@ static __inline__ void spitfire_xcall_deliver(u64 data0, u64 data1, u64 data2, c
 static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
 {
 	u64 pstate, ver;
-	int nack_busy_id, is_jalapeno;
+	int nack_busy_id, is_jbus;
 
 	if (cpus_empty(mask))
 		return;
@@ -451,7 +459,8 @@ static void cheetah_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mas
 	 * derivative processor.
 	 */
 	__asm__ ("rdpr %%ver, %0" : "=r" (ver));
-	is_jalapeno = ((ver >> 32) == 0x003e0016);
+	is_jbus = ((ver >> 32) == __JALAPENO_ID ||
+		   (ver >> 32) == __SERRANO_ID);
 
 	__asm__ __volatile__("rdpr %%pstate, %0" : "=r" (pstate));
 
@@ -476,7 +485,7 @@ retry:
 		for_each_cpu_mask(i, mask) {
 			u64 target = (i << 14) | 0x70;
 
-			if (!is_jalapeno)
+			if (!is_jbus)
 				target |= (nack_busy_id << 24);
 			__asm__ __volatile__(
 				"stxa	%%g0, [%0] %1\n\t"
@@ -529,7 +538,7 @@ retry:
 			for_each_cpu_mask(i, mask) {
 				u64 check_mask;
 
-				if (is_jalapeno)
+				if (is_jbus)
 					check_mask = (0x2UL << (2*i));
 				else
 					check_mask = (0x2UL <<
@@ -544,6 +553,155 @@ retry:
 	}
 }
 
+/* Multi-cpu list version.  */
+static void hypervisor_xcall_deliver(u64 data0, u64 data1, u64 data2, cpumask_t mask)
+{
+	struct trap_per_cpu *tb;
+	u16 *cpu_list;
+	u64 *mondo;
+	cpumask_t error_mask;
+	unsigned long flags, status;
+	int cnt, retries, this_cpu, prev_sent, i;
+
+	/* We have to do this whole thing with interrupts fully disabled.
+	 * Otherwise if we send an xcall from interrupt context it will
+	 * corrupt both our mondo block and cpu list state.
+	 *
+	 * One consequence of this is that we cannot use timeout mechanisms
+	 * that depend upon interrupts being delivered locally.  So, for
+	 * example, we cannot sample jiffies and expect it to advance.
+	 *
+	 * Fortunately, udelay() uses %stick/%tick so we can use that.
+	 */
+	local_irq_save(flags);
+
+	this_cpu = smp_processor_id();
+	tb = &trap_block[this_cpu];
+
+	mondo = __va(tb->cpu_mondo_block_pa);
+	mondo[0] = data0;
+	mondo[1] = data1;
+	mondo[2] = data2;
+	wmb();
+
+	cpu_list = __va(tb->cpu_list_pa);
+
+	/* Setup the initial cpu list.  */
+	cnt = 0;
+	for_each_cpu_mask(i, mask)
+		cpu_list[cnt++] = i;
+
+	cpus_clear(error_mask);
+	retries = 0;
+	prev_sent = 0;
+	do {
+		int forward_progress, n_sent;
+
+		status = sun4v_cpu_mondo_send(cnt,
+					      tb->cpu_list_pa,
+					      tb->cpu_mondo_block_pa);
+
+		/* HV_EOK means all cpus received the xcall, we're done.  */
+		if (likely(status == HV_EOK))
+			break;
+
+		/* First, see if we made any forward progress.
+		 *
+		 * The hypervisor indicates successful sends by setting
+		 * cpu list entries to the value 0xffff.
+		 */
+		n_sent = 0;
+		for (i = 0; i < cnt; i++) {
+			if (likely(cpu_list[i] == 0xffff))
+				n_sent++;
+		}
+
+		forward_progress = 0;
+		if (n_sent > prev_sent)
+			forward_progress = 1;
+
+		prev_sent = n_sent;
+
+		/* If we get a HV_ECPUERROR, then one or more of the cpus
+		 * in the list are in error state.  Use the cpu_state()
+		 * hypervisor call to find out which cpus are in error state.
+		 */
+		if (unlikely(status == HV_ECPUERROR)) {
+			for (i = 0; i < cnt; i++) {
+				long err;
+				u16 cpu;
+
+				cpu = cpu_list[i];
+				if (cpu == 0xffff)
+					continue;
+
+				err = sun4v_cpu_state(cpu);
+				if (err >= 0 &&
+				    err == HV_CPU_STATE_ERROR) {
+					cpu_list[i] = 0xffff;
+					cpu_set(cpu, error_mask);
+				}
+			}
+		} else if (unlikely(status != HV_EWOULDBLOCK))
+			goto fatal_mondo_error;
+
+		/* Don't bother rewriting the CPU list, just leave the
+		 * 0xffff and non-0xffff entries in there and the
+		 * hypervisor will do the right thing.
+		 *
+		 * Only advance timeout state if we didn't make any
+		 * forward progress.
+		 */
+		if (unlikely(!forward_progress)) {
+			if (unlikely(++retries > 10000))
+				goto fatal_mondo_timeout;
+
+			/* Delay a little bit to let other cpus catch up
+			 * on their cpu mondo queue work.
+			 */
+			udelay(2 * cnt);
+		}
+	} while (1);
+
+	local_irq_restore(flags);
+
+	if (unlikely(!cpus_empty(error_mask)))
+		goto fatal_mondo_cpu_error;
+
+	return;
+
+fatal_mondo_cpu_error:
+	printk(KERN_CRIT "CPU[%d]: SUN4V mondo cpu error, some target cpus "
+	       "were in error state\n",
+	       this_cpu);
+	printk(KERN_CRIT "CPU[%d]: Error mask [ ", this_cpu);
+	for_each_cpu_mask(i, error_mask)
+		printk("%d ", i);
+	printk("]\n");
+	return;
+
+fatal_mondo_timeout:
+	local_irq_restore(flags);
+	printk(KERN_CRIT "CPU[%d]: SUN4V mondo timeout, no forward "
+	       " progress after %d retries.\n",
+	       this_cpu, retries);
+	goto dump_cpu_list_and_out;
+
+fatal_mondo_error:
+	local_irq_restore(flags);
+	printk(KERN_CRIT "CPU[%d]: Unexpected SUN4V mondo error %lu\n",
+	       this_cpu, status);
+	printk(KERN_CRIT "CPU[%d]: Args were cnt(%d) cpulist_pa(%lx) "
+	       "mondo_block_pa(%lx)\n",
+	       this_cpu, cnt, tb->cpu_list_pa, tb->cpu_mondo_block_pa);
+
+dump_cpu_list_and_out:
+	printk(KERN_CRIT "CPU[%d]: CPU list [ ", this_cpu);
+	for (i = 0; i < cnt; i++)
+		printk("%u ", cpu_list[i]);
+	printk("]\n");
+}
+
 /* Send cross call to all processors mentioned in MASK
  * except self.
  */
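The retry loop in hypervisor_xcall_deliver() above keys everything off the 0xffff sentinel the hypervisor writes into delivered cpu-list slots.  A compact C restatement of just the forward-progress test, as a sketch (kernel types assumed, helper name invented here):

	/* Count delivered slots; per the loop above, a retry only counts
	 * toward the 10000-attempt timeout when this count did not grow.
	 */
	static int mondo_forward_progress(const u16 *cpu_list, int cnt, int *prev_sent)
	{
		int i, n_sent = 0;

		for (i = 0; i < cnt; i++)
			if (cpu_list[i] == 0xffff)	/* hypervisor marked this slot sent */
				n_sent++;

		if (n_sent > *prev_sent) {
			*prev_sent = n_sent;
			return 1;
		}
		*prev_sent = n_sent;
		return 0;
	}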
@@ -557,8 +715,10 @@ static void smp_cross_call_masked(unsigned long *func, u32 ctx, u64 data1, u64 d
 
 	if (tlb_type == spitfire)
 		spitfire_xcall_deliver(data0, data1, data2, mask);
-	else
+	else if (tlb_type == cheetah || tlb_type == cheetah_plus)
 		cheetah_xcall_deliver(data0, data1, data2, mask);
+	else
+		hypervisor_xcall_deliver(data0, data1, data2, mask);
 	/* NOTE: Caller runs local copy on master. */
 
 	put_cpu();
@@ -594,16 +754,13 @@ extern unsigned long xcall_call_function;
  * You must not call this function with disabled interrupts or from a
  * hardware interrupt handler or from a bottom half handler.
  */
-int smp_call_function(void (*func)(void *info), void *info,
-		      int nonatomic, int wait)
+static int smp_call_function_mask(void (*func)(void *info), void *info,
+				  int nonatomic, int wait, cpumask_t mask)
 {
 	struct call_data_struct data;
-	int cpus = num_online_cpus() - 1;
+	int cpus;
 	long timeout;
 
-	if (!cpus)
-		return 0;
-
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
@@ -614,9 +771,14 @@ int smp_call_function(void (*func)(void *info), void *info,
 
 	spin_lock(&call_lock);
 
+	cpu_clear(smp_processor_id(), mask);
+	cpus = cpus_weight(mask);
+	if (!cpus)
+		goto out_unlock;
+
 	call_data = &data;
 
-	smp_cross_call(&xcall_call_function, 0, 0, 0);
+	smp_cross_call_masked(&xcall_call_function, 0, 0, 0, mask);
 
 	/* 
 	 * Wait for other cpus to complete function or at
@@ -630,18 +792,25 @@ int smp_call_function(void (*func)(void *info), void *info,
 		udelay(1);
 	}
 
+out_unlock:
 	spin_unlock(&call_lock);
 
 	return 0;
 
 out_timeout:
 	spin_unlock(&call_lock);
-	printk("XCALL: Remote cpus not responding, ncpus=%ld finished=%ld\n",
-	       (long) num_online_cpus() - 1L,
-	       (long) atomic_read(&data.finished));
+	printk("XCALL: Remote cpus not responding, ncpus=%d finished=%d\n",
+	       cpus, atomic_read(&data.finished));
 	return 0;
 }
 
+int smp_call_function(void (*func)(void *info), void *info,
+		      int nonatomic, int wait)
+{
+	return smp_call_function_mask(func, info, nonatomic, wait,
+				      cpu_online_map);
+}
+
 void smp_call_function_client(int irq, struct pt_regs *regs)
 {
 	void (*func) (void *info) = call_data->func;
@@ -659,13 +828,25 @@ void smp_call_function_client(int irq, struct pt_regs *regs)
 	}
 }
 
+static void tsb_sync(void *info)
+{
+	struct mm_struct *mm = info;
+
+	if (current->active_mm == mm)
+		tsb_context_switch(mm);
+}
+
+void smp_tsb_sync(struct mm_struct *mm)
+{
+	smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask);
+}
+
 extern unsigned long xcall_flush_tlb_mm;
 extern unsigned long xcall_flush_tlb_pending;
 extern unsigned long xcall_flush_tlb_kernel_range;
-extern unsigned long xcall_flush_tlb_all_spitfire;
-extern unsigned long xcall_flush_tlb_all_cheetah;
 extern unsigned long xcall_report_regs;
 extern unsigned long xcall_receive_signal;
+extern unsigned long xcall_new_mmu_context_version;
 
 #ifdef DCACHE_ALIASING_POSSIBLE
 extern unsigned long xcall_flush_dcache_page_cheetah;
@@ -693,11 +874,17 @@ static __inline__ void __local_flush_dcache_page(struct page *page)
 void smp_flush_dcache_page_impl(struct page *page, int cpu)
 {
 	cpumask_t mask = cpumask_of_cpu(cpu);
-	int this_cpu = get_cpu();
+	int this_cpu;
+
+	if (tlb_type == hypervisor)
+		return;
 
 #ifdef CONFIG_DEBUG_DCFLUSH
 	atomic_inc(&dcpage_flushes);
 #endif
+
+	this_cpu = get_cpu();
+
 	if (cpu == this_cpu) {
 		__local_flush_dcache_page(page);
 	} else if (cpu_online(cpu)) {
@@ -713,7 +900,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
 					       __pa(pg_addr),
 					       (u64) pg_addr,
 					       mask);
-		} else {
+		} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 #ifdef DCACHE_ALIASING_POSSIBLE
 			data0 =
 				((u64)&xcall_flush_dcache_page_cheetah);
@@ -735,7 +922,12 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 	void *pg_addr = page_address(page);
 	cpumask_t mask = cpu_online_map;
 	u64 data0;
-	int this_cpu = get_cpu();
+	int this_cpu;
+
+	if (tlb_type == hypervisor)
+		return;
+
+	this_cpu = get_cpu();
 
 	cpu_clear(this_cpu, mask);
 
@@ -752,7 +944,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 				       __pa(pg_addr),
 				       (u64) pg_addr,
 				       mask);
-	} else {
+	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 #ifdef DCACHE_ALIASING_POSSIBLE
 		data0 = ((u64)&xcall_flush_dcache_page_cheetah);
 		cheetah_xcall_deliver(data0,
@@ -769,38 +961,58 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 	put_cpu();
 }
 
+static void __smp_receive_signal_mask(cpumask_t mask)
+{
+	smp_cross_call_masked(&xcall_receive_signal, 0, 0, 0, mask);
+}
+
 void smp_receive_signal(int cpu)
 {
 	cpumask_t mask = cpumask_of_cpu(cpu);
 
-	if (cpu_online(cpu)) {
-		u64 data0 = (((u64)&xcall_receive_signal) & 0xffffffff);
-
-		if (tlb_type == spitfire)
-			spitfire_xcall_deliver(data0, 0, 0, mask);
-		else
-			cheetah_xcall_deliver(data0, 0, 0, mask);
-	}
+	if (cpu_online(cpu))
+		__smp_receive_signal_mask(mask);
 }
 
 void smp_receive_signal_client(int irq, struct pt_regs *regs)
 {
-	/* Just return, rtrap takes care of the rest. */
 	clear_softint(1 << irq);
 }
 
-void smp_report_regs(void)
+void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs)
 {
-	smp_cross_call(&xcall_report_regs, 0, 0, 0);
+	struct mm_struct *mm;
+	unsigned long flags;
+
+	clear_softint(1 << irq);
+
+	/* See if we need to allocate a new TLB context because
+	 * the version of the one we are using is now out of date.
+	 */
+	mm = current->active_mm;
+	if (unlikely(!mm || (mm == &init_mm)))
+		return;
+
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+	if (unlikely(!CTX_VALID(mm->context)))
+		get_new_mmu_context(mm);
+
+	spin_unlock_irqrestore(&mm->context.lock, flags);
+
+	load_secondary_context(mm);
+	__flush_tlb_mm(CTX_HWBITS(mm->context),
+		       SECONDARY_CONTEXT);
 }
 
-void smp_flush_tlb_all(void)
+void smp_new_mmu_context_version(void)
 {
-	if (tlb_type == spitfire)
-		smp_cross_call(&xcall_flush_tlb_all_spitfire, 0, 0, 0);
-	else
-		smp_cross_call(&xcall_flush_tlb_all_cheetah, 0, 0, 0);
-	__flush_tlb_all();
+	smp_cross_call(&xcall_new_mmu_context_version, 0, 0, 0);
+}
+
+void smp_report_regs(void)
+{
+	smp_cross_call(&xcall_report_regs, 0, 0, 0);
 }
 
 /* We know that the window frames of the user have been flushed
@@ -944,24 +1156,19 @@ void smp_release(void)
  * can service tlb flush xcalls...
  */
 extern void prom_world(int);
-extern void save_alternate_globals(unsigned long *);
-extern void restore_alternate_globals(unsigned long *);
+
 void smp_penguin_jailcell(int irq, struct pt_regs *regs)
 {
-	unsigned long global_save[24];
-
 	clear_softint(1 << irq);
 
 	preempt_disable();
 
 	__asm__ __volatile__("flushw");
-	save_alternate_globals(global_save);
 	prom_world(1);
 	atomic_inc(&smp_capture_registry);
 	membar_storeload_storestore();
 	while (penguins_are_doing_time)
 		rmb();
-	restore_alternate_globals(global_save);
 	atomic_dec(&smp_capture_registry);
 	prom_world(0);
 
@@ -1071,7 +1278,7 @@ int setup_profiling_timer(unsigned int multiplier)
 		return -EINVAL;
 
 	spin_lock_irqsave(&prof_setup_lock, flags);
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		prof_multiplier(i) = multiplier;
 	current_tick_offset = (timer_tick_offset / multiplier);
 	spin_unlock_irqrestore(&prof_setup_lock, flags);
@@ -1082,6 +1289,8 @@ int setup_profiling_timer(unsigned int multiplier)
 /* Constrain the number of cpus to max_cpus.  */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	int i;
+
 	if (num_possible_cpus() > max_cpus) {
 		int instance, mid;
 
@@ -1089,6 +1298,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 		while (!cpu_find_by_instance(instance, NULL, &mid)) {
 			if (mid != boot_cpu_id) {
 				cpu_clear(mid, phys_cpu_present_map);
+				cpu_clear(mid, cpu_present_map);
 				if (num_possible_cpus() <= max_cpus)
 					break;
 			}
@@ -1096,6 +1306,20 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 		}
 	}
 
+	for_each_cpu(i) {
+		if (tlb_type == hypervisor) {
+			int j;
+
+			/* XXX get this mapping from machine description */
+			for_each_cpu(j) {
+				if ((j >> 2) == (i >> 2))
+					cpu_set(j, cpu_sibling_map[i]);
+			}
+		} else {
+			cpu_set(i, cpu_sibling_map[i]);
+		}
+	}
+
 	smp_store_cpu_info(boot_cpu_id);
 }
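The provisional sibling grouping added to smp_prepare_cpus() above treats every block of four cpu ids as one core, which matches Niagara's four strands per core; as the XXX comment notes, the real mapping is meant to come from the machine description.  A one-line illustration of the test, purely as a sketch:

	/* (i >> 2) == (j >> 2): cpus 0-3 are siblings, 4-7 are siblings, ... */
	static int same_niagara_core(int i, int j)
	{
		return (i >> 2) == (j >> 2);	/* e.g. cpus 5 and 7 -> 1, cpus 3 and 4 -> 0 */
	}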
 
@@ -1109,20 +1333,25 @@ void __init smp_setup_cpu_possible_map(void)
 
 	instance = 0;
 	while (!cpu_find_by_instance(instance, NULL, &mid)) {
-		if (mid < NR_CPUS)
+		if (mid < NR_CPUS) {
 			cpu_set(mid, phys_cpu_present_map);
+			cpu_set(mid, cpu_present_map);
+		}
 		instance++;
 	}
 }
 
 void __devinit smp_prepare_boot_cpu(void)
 {
-	if (hard_smp_processor_id() >= NR_CPUS) {
+	int cpu = hard_smp_processor_id();
+
+	if (cpu >= NR_CPUS) {
 		prom_printf("Serious problem, boot cpu id >= NR_CPUS\n");
 		prom_halt();
 	}
 
-	current_thread_info()->cpu = hard_smp_processor_id();
+	current_thread_info()->cpu = cpu;
+	__local_per_cpu_offset = __per_cpu_offset(cpu);
 
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), phys_cpu_present_map);
@@ -1139,7 +1368,11 @@ int __devinit __cpu_up(unsigned int cpu)
 		if (!cpu_isset(cpu, cpu_online_map)) {
 			ret = -ENODEV;
 		} else {
-			smp_synchronize_one_tick(cpu);
+			/* On SUN4V, writes to %tick and %stick are
+			 * not allowed.
+			 */
+			if (tlb_type != hypervisor)
+				smp_synchronize_one_tick(cpu);
 		}
 	}
 	return ret;
@@ -1150,10 +1383,8 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	unsigned long bogosum = 0;
 	int i;
 
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_online(i))
-			bogosum += cpu_data(i).udelay_val;
-	}
+	for_each_online_cpu(i)
+		bogosum += cpu_data(i).udelay_val;
 	printk("Total of %ld processors activated "
 	       "(%lu.%02lu BogoMIPS).\n",
 	       (long) num_online_cpus(),
@@ -1183,12 +1414,9 @@ void __init setup_per_cpu_areas(void)
 {
 	unsigned long goal, size, i;
 	char *ptr;
-	/* Created by linker magic */
-	extern char __per_cpu_start[], __per_cpu_end[];
 
 	/* Copy section for each CPU (we discard the original) */
-	goal = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
-
+	goal = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
 #ifdef CONFIG_MODULES
 	if (goal < PERCPU_ENOUGH_ROOM)
 		goal = PERCPU_ENOUGH_ROOM;
@@ -1197,31 +1425,10 @@ void __init setup_per_cpu_areas(void)
 	for (size = 1UL; size < goal; size <<= 1UL)
 		__per_cpu_shift++;
 
-	/* Make sure the resulting __per_cpu_base value
-	 * will fit in the 43-bit sign extended IMMU
-	 * TSB register.
-	 */
-	ptr = __alloc_bootmem(size * NR_CPUS, PAGE_SIZE,
-			      (unsigned long) __per_cpu_start);
+	ptr = alloc_bootmem(size * NR_CPUS);
 
 	__per_cpu_base = ptr - __per_cpu_start;
 
-	if ((__per_cpu_shift < PAGE_SHIFT) ||
-	    (__per_cpu_base & ~PAGE_MASK) ||
-	    (__per_cpu_base != (((long) __per_cpu_base << 20) >> 20))) {
-		prom_printf("PER_CPU: Invalid layout, "
-			    "ptr[%p] shift[%lx] base[%lx]\n",
-			    ptr, __per_cpu_shift, __per_cpu_base);
-		prom_halt();
-	}
-
 	for (i = 0; i < NR_CPUS; i++, ptr += size)
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
-	/* Finally, load in the boot cpu's base value.
-	 * We abuse the IMMU TSB register for trap handler
-	 * entry and exit loading of %g5.  That is why it
-	 * has to be page aligned.
-	 */
-	cpu_setup_percpu_base(hard_smp_processor_id());
 }
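setup_per_cpu_areas() above rounds the per-cpu section up to a power of two so a cpu's copy can be reached with a single shift.  A small sketch of that sizing (treating __per_cpu_offset() as base + (cpu << shift) is an assumption; its definition lives outside this diff):

	/* Round the per-cpu section size up to a power of two and return
	 * the corresponding shift.
	 */
	static unsigned long percpu_shift_for(unsigned long goal)
	{
		unsigned long size, shift = 0;

		for (size = 1UL; size < goal; size <<= 1UL)
			shift++;
		return shift;	/* e.g. a 72KB goal gives size 128KB, shift 17 */
	}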
diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c
index 3c06bfb92a8c..f5e8db1de76b 100644
--- a/arch/sparc64/kernel/sparc64_ksyms.c
+++ b/arch/sparc64/kernel/sparc64_ksyms.c
@@ -95,9 +95,6 @@ extern int __ashrdi3(int, int);
 
 extern int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs);
 
-extern unsigned long phys_base;
-extern unsigned long pfn_base;
-
 extern unsigned int sys_call_table[];
 
 extern void xor_vis_2(unsigned long, unsigned long *, unsigned long *);
@@ -108,6 +105,14 @@ extern void xor_vis_4(unsigned long, unsigned long *, unsigned long *,
 extern void xor_vis_5(unsigned long, unsigned long *, unsigned long *,
 		      unsigned long *, unsigned long *, unsigned long *);
 
+extern void xor_niagara_2(unsigned long, unsigned long *, unsigned long *);
+extern void xor_niagara_3(unsigned long, unsigned long *, unsigned long *,
+			  unsigned long *);
+extern void xor_niagara_4(unsigned long, unsigned long *, unsigned long *,
+			  unsigned long *, unsigned long *);
+extern void xor_niagara_5(unsigned long, unsigned long *, unsigned long *,
+			  unsigned long *, unsigned long *, unsigned long *);
+
 /* Per-CPU information table */
 EXPORT_PER_CPU_SYMBOL(__cpu_data);
 
@@ -170,11 +175,6 @@ EXPORT_SYMBOL(set_bit);
 EXPORT_SYMBOL(clear_bit);
 EXPORT_SYMBOL(change_bit);
 
-/* Bit searching */
-EXPORT_SYMBOL(find_next_bit);
-EXPORT_SYMBOL(find_next_zero_bit);
-EXPORT_SYMBOL(find_next_zero_le_bit);
-
 EXPORT_SYMBOL(ivector_table);
 EXPORT_SYMBOL(enable_irq);
 EXPORT_SYMBOL(disable_irq);
@@ -241,10 +241,6 @@ EXPORT_SYMBOL(verify_compat_iovec);
 #endif
 
 EXPORT_SYMBOL(dump_fpu);
-EXPORT_SYMBOL(pte_alloc_one_kernel);
-#ifndef CONFIG_SMP
-EXPORT_SYMBOL(pgt_quicklists);
-#endif
 EXPORT_SYMBOL(put_fs_struct);
 
 /* math-emu wants this */
@@ -278,18 +274,9 @@ EXPORT_SYMBOL(__prom_getsibling);
 
 /* sparc library symbols */
 EXPORT_SYMBOL(strlen);
-EXPORT_SYMBOL(strnlen);
 EXPORT_SYMBOL(__strlen_user);
 EXPORT_SYMBOL(__strnlen_user);
-EXPORT_SYMBOL(strcpy);
-EXPORT_SYMBOL(strncpy);
-EXPORT_SYMBOL(strcat);
-EXPORT_SYMBOL(strncat);
-EXPORT_SYMBOL(strcmp);
-EXPORT_SYMBOL(strchr);
-EXPORT_SYMBOL(strrchr);
 EXPORT_SYMBOL(strpbrk);
-EXPORT_SYMBOL(strstr);
 
 #ifdef CONFIG_SOLARIS_EMUL_MODULE
 EXPORT_SYMBOL(linux_sparc_syscall);
@@ -323,7 +310,6 @@ EXPORT_SYMBOL(__memscan_zero);
 EXPORT_SYMBOL(__memscan_generic);
 EXPORT_SYMBOL(__memcmp);
 EXPORT_SYMBOL(__memset);
-EXPORT_SYMBOL(memchr);
 
 EXPORT_SYMBOL(csum_partial);
 EXPORT_SYMBOL(csum_partial_copy_nocheck);
@@ -339,14 +325,10 @@ EXPORT_SYMBOL(copy_to_user_fixup);
 EXPORT_SYMBOL(copy_from_user_fixup);
 EXPORT_SYMBOL(copy_in_user_fixup);
 EXPORT_SYMBOL(__strncpy_from_user);
-EXPORT_SYMBOL(__bzero_noasi);
+EXPORT_SYMBOL(__clear_user);
 
 /* Various address conversion macros use this. */
-EXPORT_SYMBOL(phys_base);
-EXPORT_SYMBOL(pfn_base);
 EXPORT_SYMBOL(sparc64_valid_addr_bitmap);
-EXPORT_SYMBOL(page_to_pfn);
-EXPORT_SYMBOL(pfn_to_page);
 
 /* No version information on this, heavily used in inline asm,
  * and will always be 'void __ret_efault(void)'.
@@ -392,4 +374,9 @@ EXPORT_SYMBOL(xor_vis_3);
 EXPORT_SYMBOL(xor_vis_4);
 EXPORT_SYMBOL(xor_vis_5);
 
+EXPORT_SYMBOL(xor_niagara_2);
+EXPORT_SYMBOL(xor_niagara_3);
+EXPORT_SYMBOL(xor_niagara_4);
+EXPORT_SYMBOL(xor_niagara_5);
+
 EXPORT_SYMBOL(prom_palette);
diff --git a/arch/sparc64/kernel/sun4v_ivec.S b/arch/sparc64/kernel/sun4v_ivec.S
new file mode 100644
index 000000000000..b49a68bdda43
--- /dev/null
+++ b/arch/sparc64/kernel/sun4v_ivec.S
@@ -0,0 +1,334 @@
+/* sun4v_ivec.S: Sun4v interrupt vector handling.
+ *
+ * Copyright (C) 2006 <davem@davemloft.net>
+ */
+
+#include <asm/cpudata.h>
+#include <asm/intr_queue.h>
+
+	.text
+	.align	32
+
+sun4v_cpu_mondo:
+	/* Head offset in %g2, tail offset in %g4.
+	 * If they are the same, no work.
+	 */
+	mov	INTRQ_CPU_MONDO_HEAD, %g2
+	ldxa	[%g2] ASI_QUEUE, %g2
+	mov	INTRQ_CPU_MONDO_TAIL, %g4
+	ldxa	[%g4] ASI_QUEUE, %g4
+	cmp	%g2, %g4
+	be,pn	%xcc, sun4v_cpu_mondo_queue_empty
+	 nop
+
+	/* Get &trap_block[smp_processor_id()] into %g3.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g3
+	sub	%g3, TRAP_PER_CPU_FAULT_INFO, %g3
+
+	/* Get CPU mondo queue base phys address into %g7.  */
+	ldx	[%g3 + TRAP_PER_CPU_CPU_MONDO_PA], %g7
+
+	/* Now get the cross-call arguments and handler PC, same
+	 * layout as sun4u:
+	 *
+	 * 1st 64-bit word: low half is 32-bit PC, put into %g3 and jmpl to it
+	 *                  high half is context arg to MMU flushes, into %g5
+	 * 2nd 64-bit word: 64-bit arg, load into %g1
+	 * 3rd 64-bit word: 64-bit arg, load into %g7
+	 */
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g3
+	add	%g2, 0x8, %g2
+	srlx	%g3, 32, %g5
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g1
+	add	%g2, 0x8, %g2
+	srl	%g3, 0, %g3
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g7
+	add	%g2, 0x40 - 0x8 - 0x8, %g2
+
+	/* Update queue head pointer.  */
+	sethi	%hi(8192 - 1), %g4
+	or	%g4, %lo(8192 - 1), %g4
+	and	%g2, %g4, %g2
+
+	mov	INTRQ_CPU_MONDO_HEAD, %g4
+	stxa	%g2, [%g4] ASI_QUEUE
+	membar	#Sync
+
+	jmpl	%g3, %g0
+	 nop
+
+sun4v_cpu_mondo_queue_empty:
+	retry
+
+sun4v_dev_mondo:
+	/* Head offset in %g2, tail offset in %g4.  */
+	mov	INTRQ_DEVICE_MONDO_HEAD, %g2
+	ldxa	[%g2] ASI_QUEUE, %g2
+	mov	INTRQ_DEVICE_MONDO_TAIL, %g4
+	ldxa	[%g4] ASI_QUEUE, %g4
+	cmp	%g2, %g4
+	be,pn	%xcc, sun4v_dev_mondo_queue_empty
+	 nop
+
+	/* Get &trap_block[smp_processor_id()] into %g3.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g3
+	sub	%g3, TRAP_PER_CPU_FAULT_INFO, %g3
+
+	/* Get DEV mondo queue base phys address into %g5.  */
+	ldx	[%g3 + TRAP_PER_CPU_DEV_MONDO_PA], %g5
+
+	/* Load IVEC into %g3.  */
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	add	%g2, 0x40, %g2
+
+	/* XXX There can be a full 64-byte block of data here.
+	 * XXX This is how we can get at MSI vector data.
+	 * XXX Currently we do not capture this, but when we do we'll
+	 * XXX need to add a 64-byte storage area in the struct ino_bucket
+	 * XXX or the struct irq_desc.
+	 */
+
+	/* Update queue head pointer, this frees up some registers.  */
+	sethi	%hi(8192 - 1), %g4
+	or	%g4, %lo(8192 - 1), %g4
+	and	%g2, %g4, %g2
+
+	mov	INTRQ_DEVICE_MONDO_HEAD, %g4
+	stxa	%g2, [%g4] ASI_QUEUE
+	membar	#Sync
+
+	/* Get &__irq_work[smp_processor_id()] into %g1.  */
+	TRAP_LOAD_IRQ_WORK(%g1, %g4)
+
+	/* Get &ivector_table[IVEC] into %g4.  */
+	sethi	%hi(ivector_table), %g4
+	sllx	%g3, 5, %g3
+	or	%g4, %lo(ivector_table), %g4
+	add	%g4, %g3, %g4
+
+	/* Load IRQ %pil into %g5.  */
+	ldub	[%g4 + 0x04], %g5
+
+	/* Insert ivector_table[] entry into __irq_work[] queue.  */
+	sllx	%g5, 2, %g3
+	lduw	[%g1 + %g3], %g2	/* g2 = irq_work(cpu, pil) */
+	stw	%g2, [%g4 + 0x00]	/* bucket->irq_chain = g2 */
+	stw	%g4, [%g1 + %g3]	/* irq_work(cpu, pil) = bucket */
+
+	/* Signal the interrupt by setting (1 << pil) in %softint.  */
+	mov	1, %g2
+	sllx	%g2, %g5, %g2
+	wr	%g2, 0x0, %set_softint
+
+sun4v_dev_mondo_queue_empty:
+	retry
+
+sun4v_res_mondo:
+	/* Head offset in %g2, tail offset in %g4.  */
+	mov	INTRQ_RESUM_MONDO_HEAD, %g2
+	ldxa	[%g2] ASI_QUEUE, %g2
+	mov	INTRQ_RESUM_MONDO_TAIL, %g4
+	ldxa	[%g4] ASI_QUEUE, %g4
+	cmp	%g2, %g4
+	be,pn	%xcc, sun4v_res_mondo_queue_empty
+	 nop
+
+	/* Get &trap_block[smp_processor_id()] into %g3.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g3
+	sub	%g3, TRAP_PER_CPU_FAULT_INFO, %g3
+
+	/* Get RES mondo queue base phys address into %g5.  */
+	ldx	[%g3 + TRAP_PER_CPU_RESUM_MONDO_PA], %g5
+
+	/* Get RES kernel buffer base phys address into %g7.  */
+	ldx	[%g3 + TRAP_PER_CPU_RESUM_KBUF_PA], %g7
+
+	/* If the first word is non-zero, queue is full.  */
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g1
+	brnz,pn	%g1, sun4v_res_mondo_queue_full
+	 nop
+
+	/* Remember this entry's offset in %g1.  */
+	mov	%g2, %g1
+
+	/* Copy 64-byte queue entry into kernel buffer.  */
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+
+	/* Update queue head pointer.  */
+	sethi	%hi(8192 - 1), %g4
+	or	%g4, %lo(8192 - 1), %g4
+	and	%g2, %g4, %g2
+
+	mov	INTRQ_RESUM_MONDO_HEAD, %g4
+	stxa	%g2, [%g4] ASI_QUEUE
+	membar	#Sync
+
+	/* Disable interrupts and save register state so we can call
+	 * C code.  The etrap handling will leave %g4 in %l4 for us
+	 * when it's done.
+	 */
+	rdpr	%pil, %g2
+	wrpr	%g0, 15, %pil
+	mov	%g1, %g4
+	ba,pt	%xcc, etrap_irq
+	 rd	%pc, %g7
+
+	/* Log the event.  */
+	add	%sp, PTREGS_OFF, %o0
+	call	sun4v_resum_error
+	 mov	%l4, %o1
+
+	/* Return from trap.  */
+	ba,pt	%xcc, rtrap_irq
+	 nop
+
+sun4v_res_mondo_queue_empty:
+	retry
+
+sun4v_res_mondo_queue_full:
+	/* The queue is full, consolidate our damage by setting
+	 * the head equal to the tail.  We'll just trap again otherwise.
+	 * Call C code to log the event.
+	 */
+	mov	INTRQ_RESUM_MONDO_HEAD, %g2
+	stxa	%g4, [%g2] ASI_QUEUE
+	membar	#Sync
+
+	rdpr	%pil, %g2
+	wrpr	%g0, 15, %pil
+	ba,pt	%xcc, etrap_irq
+	 rd	%pc, %g7
+
+	call	sun4v_resum_overflow
+	 add	%sp, PTREGS_OFF, %o0
+
+	ba,pt	%xcc, rtrap_irq
+	 nop
+
+sun4v_nonres_mondo:
+	/* Head offset in %g2, tail offset in %g4.  */
+	mov	INTRQ_NONRESUM_MONDO_HEAD, %g2
+	ldxa	[%g2] ASI_QUEUE, %g2
+	mov	INTRQ_NONRESUM_MONDO_TAIL, %g4
+	ldxa	[%g4] ASI_QUEUE, %g4
+	cmp	%g2, %g4
+	be,pn	%xcc, sun4v_nonres_mondo_queue_empty
+	 nop
+
+	/* Get &trap_block[smp_processor_id()] into %g3.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g3
+	sub	%g3, TRAP_PER_CPU_FAULT_INFO, %g3
+
+	/* Get RES mondo queue base phys address into %g5.  */
+	ldx	[%g3 + TRAP_PER_CPU_NONRESUM_MONDO_PA], %g5
+
+	/* Get RES kernel buffer base phys address into %g7.  */
+	ldx	[%g3 + TRAP_PER_CPU_NONRESUM_KBUF_PA], %g7
+
+	/* If the first word is non-zero, queue is full.  */
+	ldxa	[%g7 + %g2] ASI_PHYS_USE_EC, %g1
+	brnz,pn	%g1, sun4v_nonres_mondo_queue_full
+	 nop
+
+	/* Remember this entry's offset in %g1.  */
+	mov	%g2, %g1
+
+	/* Copy 64-byte queue entry into kernel buffer.  */
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+	ldxa	[%g5 + %g2] ASI_PHYS_USE_EC, %g3
+	stxa	%g3, [%g7 + %g2] ASI_PHYS_USE_EC
+	add	%g2, 0x08, %g2
+
+	/* Update queue head pointer.  */
+	sethi	%hi(8192 - 1), %g4
+	or	%g4, %lo(8192 - 1), %g4
+	and	%g2, %g4, %g2
+
+	mov	INTRQ_NONRESUM_MONDO_HEAD, %g4
+	stxa	%g2, [%g4] ASI_QUEUE
+	membar	#Sync
+
+	/* Disable interrupts and save register state so we can call
+	 * C code.  The etrap handling will leave %g4 in %l4 for us
+	 * when it's done.
+	 */
+	rdpr	%pil, %g2
+	wrpr	%g0, 15, %pil
+	mov	%g1, %g4
+	ba,pt	%xcc, etrap_irq
+	 rd	%pc, %g7
+
+	/* Log the event.  */
+	add	%sp, PTREGS_OFF, %o0
+	call	sun4v_nonresum_error
+	 mov	%l4, %o1
+
+	/* Return from trap.  */
+	ba,pt	%xcc, rtrap_irq
+	 nop
+
+sun4v_nonres_mondo_queue_empty:
+	retry
+
+sun4v_nonres_mondo_queue_full:
+	/* The queue is full, consolidate our damage by setting
+	 * the head equal to the tail.  We'll just trap again otherwise.
+	 * Call C code to log the event.
+	 */
+	mov	INTRQ_NONRESUM_MONDO_HEAD, %g2
+	stxa	%g4, [%g2] ASI_QUEUE
+	membar	#Sync
+
+	rdpr	%pil, %g2
+	wrpr	%g0, 15, %pil
+	ba,pt	%xcc, etrap_irq
+	 rd	%pc, %g7
+
+	call	sun4v_nonresum_overflow
+	 add	%sp, PTREGS_OFF, %o0
+
+	ba,pt	%xcc, rtrap_irq
+	 nop
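Each handler in sun4v_ivec.S above advances its queue head by one entry and masks with (8192 - 1); the sethi/or/and triple just builds that mask in registers.  As a one-line C sketch (the 8K queue size and 0x40-byte entries are the values used in the assembly):

	/* Wrap the head pointer of an 8KB interrupt queue after consuming
	 * one 64-byte entry.
	 */
	static unsigned long advance_mondo_head(unsigned long head)
	{
		return (head + 0x40) & (8192 - 1);
	}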
diff --git a/arch/sparc64/kernel/sun4v_tlb_miss.S b/arch/sparc64/kernel/sun4v_tlb_miss.S
new file mode 100644
index 000000000000..b731881224e8
--- /dev/null
+++ b/arch/sparc64/kernel/sun4v_tlb_miss.S
@@ -0,0 +1,426 @@
+/* sun4v_tlb_miss.S: Sun4v TLB miss handlers.
+ *
+ * Copyright (C) 2006 <davem@davemloft.net>
+ */
+
+	.text
+	.align	32
+
+	/* Load ITLB fault information into VADDR and CTX, using BASE.  */
+#define LOAD_ITLB_INFO(BASE, VADDR, CTX) \
+	ldx	[BASE + HV_FAULT_I_ADDR_OFFSET], VADDR; \
+	ldx	[BASE + HV_FAULT_I_CTX_OFFSET], CTX;
+
+	/* Load DTLB fault information into VADDR and CTX, using BASE.  */
+#define LOAD_DTLB_INFO(BASE, VADDR, CTX) \
+	ldx	[BASE + HV_FAULT_D_ADDR_OFFSET], VADDR; \
+	ldx	[BASE + HV_FAULT_D_CTX_OFFSET], CTX;
+
+	/* DEST = (VADDR >> 22)
+	 *
+	 * Branch to ZERO_CTX_LABEL if context is zero.
+	 */
+#define	COMPUTE_TAG_TARGET(DEST, VADDR, CTX, ZERO_CTX_LABEL) \
+	srlx	VADDR, 22, DEST; \
+	brz,pn	CTX, ZERO_CTX_LABEL; \
+	 nop;
+
+	/* Create TSB pointer.  This is something like:
+	 *
+	 * index_mask = (512 << (tsb_reg & 0x7UL)) - 1UL;
+	 * tsb_base = tsb_reg & ~0x7UL;
+	 * tsb_index = ((vaddr >> HASH_SHIFT) & index_mask);
+	 * tsb_ptr = tsb_base + (tsb_index * 16);
+	 */
+#define COMPUTE_TSB_PTR(TSB_PTR, VADDR, HASH_SHIFT, TMP1, TMP2) \
+	and	TSB_PTR, 0x7, TMP1;			\
+	mov	512, TMP2;				\
+	andn	TSB_PTR, 0x7, TSB_PTR;			\
+	sllx	TMP2, TMP1, TMP2;			\
+	srlx	VADDR, HASH_SHIFT, TMP1;		\
+	sub	TMP2, 1, TMP2;				\
+	and	TMP1, TMP2, TMP1;			\
+	sllx	TMP1, 4, TMP1;				\
+	add	TSB_PTR, TMP1, TSB_PTR;
+
+sun4v_itlb_miss:
+	/* Load MMU Miss base into %g2.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	
+	/* Load UTSB reg into %g1.  */
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g1
+
+	LOAD_ITLB_INFO(%g2, %g4, %g5)
+	COMPUTE_TAG_TARGET(%g6, %g4, %g5, kvmap_itlb_4v)
+	COMPUTE_TSB_PTR(%g1, %g4, PAGE_SHIFT, %g3, %g7)
+
+	/* Load TSB tag/pte into %g2/%g3 and compare the tag.  */
+	ldda	[%g1] ASI_QUAD_LDD_PHYS_4V, %g2
+	cmp	%g2, %g6
+	bne,a,pn %xcc, tsb_miss_page_table_walk
+	 mov	FAULT_CODE_ITLB, %g3
+	andcc	%g3, _PAGE_EXEC_4V, %g0
+	be,a,pn	%xcc, tsb_do_fault
+	 mov	FAULT_CODE_ITLB, %g3
+
+	/* We have a valid entry, make hypervisor call to load
+	 * I-TLB and return from trap.
+	 *
+	 * %g3:	PTE
+	 * %g4:	vaddr
+	 */
+sun4v_itlb_load:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g6
+	mov	%o0, %g1		! save %o0
+	mov	%o1, %g2		! save %o1
+	mov	%o2, %g5		! save %o2
+	mov	%o3, %g7		! save %o3
+	mov	%g4, %o0		! vaddr
+	ldx	[%g6 + HV_FAULT_I_CTX_OFFSET], %o1	! ctx
+	mov	%g3, %o2		! PTE
+	mov	HV_MMU_IMMU, %o3	! flags
+	ta	HV_MMU_MAP_ADDR_TRAP
+	brnz,pn	%o0, sun4v_itlb_error
+	 mov	%g2, %o1		! restore %o1
+	mov	%g1, %o0		! restore %o0
+	mov	%g5, %o2		! restore %o2
+	mov	%g7, %o3		! restore %o3
+
+	retry
+
+sun4v_dtlb_miss:
+	/* Load MMU Miss base into %g2.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	
+	/* Load UTSB reg into %g1.  */
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g1
+
+	LOAD_DTLB_INFO(%g2, %g4, %g5)
+	COMPUTE_TAG_TARGET(%g6, %g4, %g5, kvmap_dtlb_4v)
+	COMPUTE_TSB_PTR(%g1, %g4, PAGE_SHIFT, %g3, %g7)
+
+	/* Load TSB tag/pte into %g2/%g3 and compare the tag.  */
+	ldda	[%g1] ASI_QUAD_LDD_PHYS_4V, %g2
+	cmp	%g2, %g6
+	bne,a,pn %xcc, tsb_miss_page_table_walk
+	 mov	FAULT_CODE_DTLB, %g3
+
+	/* We have a valid entry, make hypervisor call to load
+	 * D-TLB and return from trap.
+	 *
+	 * %g3:	PTE
+	 * %g4:	vaddr
+	 */
+sun4v_dtlb_load:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g6
+	mov	%o0, %g1		! save %o0
+	mov	%o1, %g2		! save %o1
+	mov	%o2, %g5		! save %o2
+	mov	%o3, %g7		! save %o3
+	mov	%g4, %o0		! vaddr
+	ldx	[%g6 + HV_FAULT_D_CTX_OFFSET], %o1	! ctx
+	mov	%g3, %o2		! PTE
+	mov	HV_MMU_DMMU, %o3	! flags
+	ta	HV_MMU_MAP_ADDR_TRAP
+	brnz,pn	%o0, sun4v_dtlb_error
+	 mov	%g2, %o1		! restore %o1
+	mov	%g1, %o0		! restore %o0
+	mov	%g5, %o2		! restore %o2
+	mov	%g7, %o3		! restore %o3
+
+	retry
+
+sun4v_dtlb_prot:
+	SET_GL(1)
+
+	/* Load MMU Miss base into %g5.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g5
+	
+	ldx	[%g5 + HV_FAULT_D_ADDR_OFFSET], %g5
+	rdpr	%tl, %g1
+	cmp	%g1, 1
+	bgu,pn	%xcc, winfix_trampoline
+	 nop
+	ba,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_DTLB | FAULT_CODE_WRITE, %g4
+
+	/* Called from trap table:
+	 * %g4:	vaddr
+	 * %g5:	context
+	 * %g6: TAG TARGET
+	 */
+sun4v_itsb_miss:
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g1
+	brz,pn	%g5, kvmap_itlb_4v
+	 mov	FAULT_CODE_ITLB, %g3
+	ba,a,pt	%xcc, sun4v_tsb_miss_common
+
+	/* Called from trap table:
+	 * %g4:	vaddr
+	 * %g5:	context
+	 * %g6: TAG TARGET
+	 */
+sun4v_dtsb_miss:
+	mov	SCRATCHPAD_UTSBREG1, %g1
+	ldxa	[%g1] ASI_SCRATCHPAD, %g1
+	brz,pn	%g5, kvmap_dtlb_4v
+	 mov	FAULT_CODE_DTLB, %g3
+
+	/* fallthrough */
+
+sun4v_tsb_miss_common:
+	COMPUTE_TSB_PTR(%g1, %g4, PAGE_SHIFT, %g5, %g7)
+
+	sub	%g2, TRAP_PER_CPU_FAULT_INFO, %g2
+
+#ifdef CONFIG_HUGETLB_PAGE
+	mov	SCRATCHPAD_UTSBREG2, %g5
+	ldxa	[%g5] ASI_SCRATCHPAD, %g5
+	cmp	%g5, -1
+	be,pt	%xcc, 80f
+	 nop
+	COMPUTE_TSB_PTR(%g5, %g4, HPAGE_SHIFT, %g2, %g7)
+
+	/* That clobbered %g2, reload it.  */
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	sub	%g2, TRAP_PER_CPU_FAULT_INFO, %g2
+
+80:	stx	%g5, [%g2 + TRAP_PER_CPU_TSB_HUGE_TEMP]
+#endif
+
+	ba,pt	%xcc, tsb_miss_page_table_walk_sun4v_fastpath
+	 ldx	[%g2 + TRAP_PER_CPU_PGD_PADDR], %g7
+
+sun4v_itlb_error:
+	sethi	%hi(sun4v_err_itlb_vaddr), %g1
+	stx	%g4, [%g1 + %lo(sun4v_err_itlb_vaddr)]
+	sethi	%hi(sun4v_err_itlb_ctx), %g1
+	ldxa	[%g0] ASI_SCRATCHPAD, %g6
+	ldx	[%g6 + HV_FAULT_I_CTX_OFFSET], %o1
+	stx	%o1, [%g1 + %lo(sun4v_err_itlb_ctx)]
+	sethi	%hi(sun4v_err_itlb_pte), %g1
+	stx	%g3, [%g1 + %lo(sun4v_err_itlb_pte)]
+	sethi	%hi(sun4v_err_itlb_error), %g1
+	stx	%o0, [%g1 + %lo(sun4v_err_itlb_error)]
+
+	rdpr	%tl, %g4
+	cmp	%g4, 1
+	ble,pt	%icc, 1f
+	 sethi	%hi(2f), %g7
+	ba,pt	%xcc, etraptl1
+	 or	%g7, %lo(2f), %g7
+
+1:	ba,pt	%xcc, etrap
+2:	 or	%g7, %lo(2b), %g7
+	call	sun4v_itlb_error_report
+	 add	%sp, PTREGS_OFF, %o0
+
+	/* NOTREACHED */
+
+sun4v_dtlb_error:
+	sethi	%hi(sun4v_err_dtlb_vaddr), %g1
+	stx	%g4, [%g1 + %lo(sun4v_err_dtlb_vaddr)]
+	sethi	%hi(sun4v_err_dtlb_ctx), %g1
+	ldxa	[%g0] ASI_SCRATCHPAD, %g6
+	ldx	[%g6 + HV_FAULT_D_CTX_OFFSET], %o1
+	stx	%o1, [%g1 + %lo(sun4v_err_dtlb_ctx)]
+	sethi	%hi(sun4v_err_dtlb_pte), %g1
+	stx	%g3, [%g1 + %lo(sun4v_err_dtlb_pte)]
+	sethi	%hi(sun4v_err_dtlb_error), %g1
+	stx	%o0, [%g1 + %lo(sun4v_err_dtlb_error)]
+
+	rdpr	%tl, %g4
+	cmp	%g4, 1
+	ble,pt	%icc, 1f
+	 sethi	%hi(2f), %g7
+	ba,pt	%xcc, etraptl1
+	 or	%g7, %lo(2f), %g7
+
+1:	ba,pt	%xcc, etrap
+2:	 or	%g7, %lo(2b), %g7
+	call	sun4v_dtlb_error_report
+	 add	%sp, PTREGS_OFF, %o0
+
+	/* NOTREACHED */
+
+	/* Instruction Access Exception, tl0. */
+sun4v_iacc:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_I_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_I_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_I_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_insn_access_exception
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Instruction Access Exception, tl1. */
+sun4v_iacc_tl1:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_I_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_I_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_I_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etraptl1
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_insn_access_exception_tl1
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Data Access Exception, tl0. */
+sun4v_dacc:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_data_access_exception
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Data Access Exception, tl1. */
+sun4v_dacc_tl1:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etraptl1
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_data_access_exception_tl1
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Memory Address Unaligned.  */
+sun4v_mna:
+	/* Window fixup? */
+	rdpr	%tl, %g2
+	cmp	%g2, 1
+	ble,pt	%icc, 1f
+	 nop
+
+	SET_GL(1)
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g5
+	mov	HV_FAULT_TYPE_UNALIGNED, %g3
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g4
+	sllx	%g3, 16, %g3
+	or	%g4, %g3, %g4
+	ba,pt	%xcc, winfix_mna
+	 rdpr	%tpc, %g3
+	/* not reached */
+
+1:	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	mov	HV_FAULT_TYPE_UNALIGNED, %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	sun4v_do_mna
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Privileged Action.  */
+sun4v_privact:
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	call	do_privact
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Unaligned ldd float, tl0. */
+sun4v_lddfmna:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	handle_lddfmna
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	/* Unaligned std float, tl0. */
+sun4v_stdfmna:
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2
+	ldx	[%g2 + HV_FAULT_D_TYPE_OFFSET], %g3
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5
+	sllx	%g3, 16, %g3
+	or	%g5, %g3, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o1
+	mov	%l5, %o2
+	call	handle_stdfmna
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define SUN4V_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	sun4v_patch_tlb_handlers
+	.type	sun4v_patch_tlb_handlers,#function
+sun4v_patch_tlb_handlers:
+	SUN4V_DO_PATCH(tl0_iamiss, sun4v_itlb_miss)
+	SUN4V_DO_PATCH(tl1_iamiss, sun4v_itlb_miss)
+	SUN4V_DO_PATCH(tl0_damiss, sun4v_dtlb_miss)
+	SUN4V_DO_PATCH(tl1_damiss, sun4v_dtlb_miss)
+	SUN4V_DO_PATCH(tl0_daprot, sun4v_dtlb_prot)
+	SUN4V_DO_PATCH(tl1_daprot, sun4v_dtlb_prot)
+	SUN4V_DO_PATCH(tl0_iax, sun4v_iacc)
+	SUN4V_DO_PATCH(tl1_iax, sun4v_iacc_tl1)
+	SUN4V_DO_PATCH(tl0_dax, sun4v_dacc)
+	SUN4V_DO_PATCH(tl1_dax, sun4v_dacc_tl1)
+	SUN4V_DO_PATCH(tl0_mna, sun4v_mna)
+	SUN4V_DO_PATCH(tl1_mna, sun4v_mna)
+	SUN4V_DO_PATCH(tl0_lddfmna, sun4v_lddfmna)
+	SUN4V_DO_PATCH(tl0_stdfmna, sun4v_stdfmna)
+	SUN4V_DO_PATCH(tl0_privact, sun4v_privact)
+	retl
+	 nop
+	.size	sun4v_patch_tlb_handlers,.-sun4v_patch_tlb_handlers
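SUN4V_DO_PATCH above turns the first two instruction words of each sun4u handler into an unconditional branch to the sun4v version followed by a nop; the sll 11 / srl 13 pair is how the byte delta becomes a 19-bit word displacement.  A C model of the encoding, as a sketch only (the constants are the ones defined above):

	#define BRANCH_ALWAYS	0x10680000	/* ba,pt %xcc with a zero disp19 field */
	#define NOP		0x01000000

	/* Build the "ba,pt %xcc, new" opcode written over the old handler. */
	static unsigned int branch_to(unsigned long old_addr, unsigned long new_addr)
	{
		unsigned long delta = new_addr - old_addr;	/* byte offset */

		return BRANCH_ALWAYS | ((delta >> 2) & 0x7ffff);	/* 19-bit word disp */
	}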
diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c
index 5f8c822a2b4a..7a869138c37f 100644
--- a/arch/sparc64/kernel/sys_sparc.c
+++ b/arch/sparc64/kernel/sys_sparc.c
@@ -25,25 +25,93 @@
 #include <linux/syscalls.h>
 #include <linux/ipc.h>
 #include <linux/personality.h>
+#include <linux/random.h>
 
 #include <asm/uaccess.h>
 #include <asm/ipc.h>
 #include <asm/utrap.h>
 #include <asm/perfctr.h>
+#include <asm/a.out.h>
 
 /* #define DEBUG_UNIMP_SYSCALL */
 
-/* XXX Make this per-binary type, this way we can detect the type of
- * XXX a binary.  Every Sparc executable calls this very early on.
- */
 asmlinkage unsigned long sys_getpagesize(void)
 {
 	return PAGE_SIZE;
 }
 
-#define COLOUR_ALIGN(addr,pgoff)		\
-	((((addr)+SHMLBA-1)&~(SHMLBA-1)) +	\
-	 (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
+#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL))
+#define VA_EXCLUDE_END   (0xfffff80000000000UL + (1UL << 32UL))
+
+/* Does addr --> addr+len fall within 4GB of the VA-space hole or
+ * overflow past the end of the 64-bit address space?
+ */
+static inline int invalid_64bit_range(unsigned long addr, unsigned long len)
+{
+	unsigned long va_exclude_start, va_exclude_end;
+
+	va_exclude_start = VA_EXCLUDE_START;
+	va_exclude_end   = VA_EXCLUDE_END;
+
+	if (unlikely(len >= va_exclude_start))
+		return 1;
+
+	if (unlikely((addr + len) < addr))
+		return 1;
+
+	if (unlikely((addr >= va_exclude_start && addr < va_exclude_end) ||
+		     ((addr + len) >= va_exclude_start &&
+		      (addr + len) < va_exclude_end)))
+		return 1;
+
+	return 0;
+}
+
+/* Does start,end straddle the VA-space hole?  */
+static inline int straddles_64bit_va_hole(unsigned long start, unsigned long end)
+{
+	unsigned long va_exclude_start, va_exclude_end;
+
+	va_exclude_start = VA_EXCLUDE_START;
+	va_exclude_end   = VA_EXCLUDE_END;
+
+	if (likely(start < va_exclude_start && end < va_exclude_start))
+		return 0;
+
+	if (likely(start >= va_exclude_end && end >= va_exclude_end))
+		return 0;
+
+	return 1;
+}
+
+/* These functions differ from the default implementations in
+ * mm/mmap.c in two ways:
+ *
+ * 1) For file backed MAP_SHARED mmap()'s we D-cache color align,
+ *    for fixed such mappings we just validate what the user gave us.
+ * 2) For 64-bit tasks we avoid mapping anything within 4GB of
+ *    the spitfire/niagara VA-hole.
+ */
+
+static inline unsigned long COLOUR_ALIGN(unsigned long addr,
+					 unsigned long pgoff)
+{
+	unsigned long base = (addr+SHMLBA-1)&~(SHMLBA-1);
+	unsigned long off = (pgoff<<PAGE_SHIFT) & (SHMLBA-1);
+
+	return base + off;
+}
+
+static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
+					      unsigned long pgoff)
+{
+	unsigned long base = addr & ~(SHMLBA-1);
+	unsigned long off = (pgoff<<PAGE_SHIFT) & (SHMLBA-1);
+
+	if (base + off <= addr)
+		return base + off;
+	return base - off;
+}
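The VA_EXCLUDE_* guards added above keep 64-bit mappings at least 4GB away from the hardware VA hole.  A few concrete checks, as a sketch; the boundary values follow directly from the macros in this hunk (VA_EXCLUDE_START = 0x000007ff00000000, VA_EXCLUDE_END = 0xfffff80100000000):

	static void __init va_hole_examples(void)
	{
		/* Starts exactly at the exclusion zone: rejected. */
		BUG_ON(!invalid_64bit_range(0x000007ff00000000UL, 0x1000));
		/* Well below the zone: accepted. */
		BUG_ON(invalid_64bit_range(0x0000000100000000UL, 0x1000));
		/* Range crossing from low to high canonical space: straddles. */
		BUG_ON(!straddles_64bit_va_hole(0x000007fe00000000UL,
						0xfffff80200000000UL));
	}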
 
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -64,8 +132,8 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 	}
 
 	if (test_thread_flag(TIF_32BIT))
-		task_size = 0xf0000000UL;
-	if (len > task_size || len > -PAGE_OFFSET)
+		task_size = STACK_TOP32;
+	if (unlikely(len > task_size || len >= VA_EXCLUDE_START))
 		return -ENOMEM;
 
 	do_color_align = 0;
@@ -84,11 +152,12 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 			return addr;
 	}
 
-	if (len <= mm->cached_hole_size) {
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
 	        mm->cached_hole_size = 0;
-		mm->free_area_cache = TASK_UNMAPPED_BASE;
 	}
-	start_addr = addr = mm->free_area_cache;
 
 	task_size -= len;
 
@@ -100,11 +169,12 @@ full_search:
 
 	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
-		if (addr < PAGE_OFFSET && -PAGE_OFFSET - len < addr) {
-			addr = PAGE_OFFSET;
-			vma = find_vma(mm, PAGE_OFFSET);
+		if (addr < VA_EXCLUDE_START &&
+		    (addr + len) >= VA_EXCLUDE_START) {
+			addr = VA_EXCLUDE_END;
+			vma = find_vma(mm, VA_EXCLUDE_END);
 		}
-		if (task_size < addr) {
+		if (unlikely(task_size < addr)) {
 			if (start_addr != TASK_UNMAPPED_BASE) {
 				start_addr = addr = TASK_UNMAPPED_BASE;
 				mm->cached_hole_size = 0;
@@ -112,7 +182,7 @@ full_search:
 			}
 			return -ENOMEM;
 		}
-		if (!vma || addr + len <= vma->vm_start) {
+		if (likely(!vma || addr + len <= vma->vm_start)) {
 			/*
 			 * Remember the place where we stopped the search:
 			 */
@@ -128,6 +198,121 @@ full_search:
 	}
 }
 
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+			  const unsigned long len, const unsigned long pgoff,
+			  const unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long task_size = STACK_TOP32;
+	unsigned long addr = addr0;
+	int do_color_align;
+
+	/* This should only ever run for 32-bit processes.  */
+	BUG_ON(!test_thread_flag(TIF_32BIT));
+
+	if (flags & MAP_FIXED) {
+		/* We do not accept a shared mapping if it would violate
+		 * cache aliasing constraints.
+		 */
+		if ((flags & MAP_SHARED) &&
+		    ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (unlikely(len > task_size))
+		return -ENOMEM;
+
+	do_color_align = 0;
+	if (filp || (flags & MAP_SHARED))
+		do_color_align = 1;
+
+	/* requesting a specific address */
+	if (addr) {
+		if (do_color_align)
+			addr = COLOUR_ALIGN(addr, pgoff);
+		else
+			addr = PAGE_ALIGN(addr);
+
+		vma = find_vma(mm, addr);
+		if (task_size - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+ 	        mm->cached_hole_size = 0;
+ 		mm->free_area_cache = mm->mmap_base;
+ 	}
+
+	/* either no address requested or can't fit in requested address hole */
+	addr = mm->free_area_cache;
+	if (do_color_align) {
+		unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff);
+
+		addr = base + len;
+	}
+
+	/* make sure it can fit in the remaining address space */
+	if (likely(addr > len)) {
+		vma = find_vma(mm, addr-len);
+		if (!vma || addr <= vma->vm_start) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr-len);
+		}
+	}
+
+	if (unlikely(mm->mmap_base < len))
+		goto bottomup;
+
+	addr = mm->mmap_base-len;
+	if (do_color_align)
+		addr = COLOUR_ALIGN_DOWN(addr, pgoff);
+
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * else if new region fits below vma->vm_start,
+		 * return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (likely(!vma || addr+len <= vma->vm_start)) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr);
+		}
+
+ 		/* remember the largest hole we saw so far */
+ 		if (addr + mm->cached_hole_size < vma->vm_start)
+ 		        mm->cached_hole_size = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = vma->vm_start-len;
+		if (do_color_align)
+			addr = COLOUR_ALIGN_DOWN(addr, pgoff);
+	} while (likely(len < vma->vm_start));
+
+bottomup:
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->cached_hole_size = ~0UL;
+  	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
+
+	return addr;
+}
+
 /* Try to align mapping such that we align it as much as possible. */
 unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
 {
@@ -171,15 +356,57 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, u
 	return addr;
 }
 
+/* Essentially the same as PowerPC... */
+void arch_pick_mmap_layout(struct mm_struct *mm)
+{
+	unsigned long random_factor = 0UL;
+
+	if (current->flags & PF_RANDOMIZE) {
+		random_factor = get_random_int();
+		if (test_thread_flag(TIF_32BIT))
+			random_factor &= ((1 * 1024 * 1024) - 1);
+		else
+			random_factor = ((random_factor << PAGE_SHIFT) &
+					 0xffffffffUL);
+	}
+
+	/*
+	 * Fall back to the standard layout if the personality
+	 * bit is set, or if the expected stack growth is unlimited:
+	 */
+	if (!test_thread_flag(TIF_32BIT) ||
+	    (current->personality & ADDR_COMPAT_LAYOUT) ||
+	    current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY ||
+	    sysctl_legacy_va_layout) {
+		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+		mm->get_unmapped_area = arch_get_unmapped_area;
+		mm->unmap_area = arch_unmap_area;
+	} else {
+		/* We know it's 32-bit */
+		unsigned long task_size = STACK_TOP32;
+		unsigned long gap;
+
+		gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+		if (gap < 128 * 1024 * 1024)
+			gap = 128 * 1024 * 1024;
+		if (gap > (task_size / 6 * 5))
+			gap = (task_size / 6 * 5);
+
+		mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor);
+		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		mm->unmap_area = arch_unmap_area_topdown;
+	}
+}
+
 asmlinkage unsigned long sparc_brk(unsigned long brk)
 {
 	/* People could try to be nasty and use ta 0x6d in 32bit programs */
-	if (test_thread_flag(TIF_32BIT) &&
-	    brk >= 0xf0000000UL)
+	if (test_thread_flag(TIF_32BIT) && brk >= STACK_TOP32)
 		return current->mm->brk;
 
-	if ((current->mm->brk & PAGE_OFFSET) != (brk & PAGE_OFFSET))
+	if (unlikely(straddles_64bit_va_hole(current->mm->brk, brk)))
 		return current->mm->brk;
+
 	return sys_brk(brk);
 }
                                                                 
@@ -340,13 +567,16 @@ asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len,
 	retval = -EINVAL;
 
 	if (test_thread_flag(TIF_32BIT)) {
-		if (len > 0xf0000000UL ||
-		    ((flags & MAP_FIXED) && addr > 0xf0000000UL - len))
+		if (len >= STACK_TOP32)
+			goto out_putf;
+
+		if ((flags & MAP_FIXED) && addr > STACK_TOP32 - len)
 			goto out_putf;
 	} else {
-		if (len > -PAGE_OFFSET ||
-		    ((flags & MAP_FIXED) &&
-		     addr < PAGE_OFFSET && addr + len > -PAGE_OFFSET))
+		if (len >= VA_EXCLUDE_START)
+			goto out_putf;
+
+		if ((flags & MAP_FIXED) && invalid_64bit_range(addr, len))
 			goto out_putf;
 	}
 
@@ -365,9 +595,9 @@ asmlinkage long sys64_munmap(unsigned long addr, size_t len)
 {
 	long ret;
 
-	if (len > -PAGE_OFFSET ||
-	    (addr < PAGE_OFFSET && addr + len > -PAGE_OFFSET))
+	if (invalid_64bit_range(addr, len))
 		return -EINVAL;
+
 	down_write(&current->mm->mmap_sem);
 	ret = do_munmap(current->mm, addr, len);
 	up_write(&current->mm->mmap_sem);
@@ -384,18 +614,19 @@ asmlinkage unsigned long sys64_mremap(unsigned long addr,
 {
 	struct vm_area_struct *vma;
 	unsigned long ret = -EINVAL;
+
 	if (test_thread_flag(TIF_32BIT))
 		goto out;
-	if (old_len > -PAGE_OFFSET || new_len > -PAGE_OFFSET)
+	if (unlikely(new_len >= VA_EXCLUDE_START))
 		goto out;
-	if (addr < PAGE_OFFSET && addr + old_len > -PAGE_OFFSET)
+	if (unlikely(invalid_64bit_range(addr, old_len)))
 		goto out;
+
 	down_write(&current->mm->mmap_sem);
 	if (flags & MREMAP_FIXED) {
-		if (new_addr < PAGE_OFFSET &&
-		    new_addr + new_len > -PAGE_OFFSET)
+		if (invalid_64bit_range(new_addr, new_len))
 			goto out_sem;
-	} else if (addr < PAGE_OFFSET && addr + new_len > -PAGE_OFFSET) {
+	} else if (invalid_64bit_range(addr, new_len)) {
 		unsigned long map_flags = 0;
 		struct file *file = NULL;
 
@@ -554,12 +785,10 @@ asmlinkage long sys_utrap_install(utrap_entry_t type,
 	}
 	if (!current_thread_info()->utraps) {
 		current_thread_info()->utraps =
-			kmalloc((UT_TRAP_INSTRUCTION_31+1)*sizeof(long), GFP_KERNEL);
+			kzalloc((UT_TRAP_INSTRUCTION_31+1)*sizeof(long), GFP_KERNEL);
 		if (!current_thread_info()->utraps)
 			return -ENOMEM;
 		current_thread_info()->utraps[0] = 1;
-		memset(current_thread_info()->utraps+1, 0,
-		       UT_TRAP_INSTRUCTION_31*sizeof(long));
 	} else {
 		if ((utrap_handler_t)current_thread_info()->utraps[type] != new_p &&
 		    current_thread_info()->utraps[0] > 1) {
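
The new arch_get_unmapped_area_topdown() and arch_pick_mmap_layout() above clamp
the expected stack gap before deriving mmap_base for 32-bit tasks.  A minimal C
sketch of that arithmetic, assuming STACK_TOP32 is 0xf0000000UL (illustrative
only, not part of the patch):

#include <asm/page.h>		/* PAGE_ALIGN */

/* Worked example of the gap clamping in arch_pick_mmap_layout(): the
 * stack rlimit is clamped to [128MB, 5/6 of the 32-bit task size]
 * before being subtracted from the top of the address space.
 */
static unsigned long example_mmap_base(unsigned long stack_rlimit,
				       unsigned long random_factor)
{
	unsigned long task_size = 0xf0000000UL;	/* STACK_TOP32 */
	unsigned long gap = stack_rlimit;

	if (gap < 128 * 1024 * 1024)
		gap = 128 * 1024 * 1024;
	if (gap > (task_size / 6 * 5))
		gap = (task_size / 6 * 5);

	/* e.g. an 8MB stack rlimit clamps to a 128MB gap, so mmap_base
	 * lands at 0xe8000000 minus the page-aligned random offset.
	 */
	return PAGE_ALIGN(task_size - gap - random_factor);
}
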
diff --git a/arch/sparc64/kernel/sys_sparc32.c b/arch/sparc64/kernel/sys_sparc32.c
index 417727bd87ba..2e906bad56fa 100644
--- a/arch/sparc64/kernel/sys_sparc32.c
+++ b/arch/sparc64/kernel/sys_sparc32.c
@@ -19,7 +19,6 @@
 #include <linux/resource.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/timex.h>
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/sem.h>
@@ -62,6 +61,7 @@
 #include <asm/fpumacro.h>
 #include <asm/semaphore.h>
 #include <asm/mmu_context.h>
+#include <asm/a.out.h>
 
 asmlinkage long sys32_chown16(const char __user * filename, u16 user, u16 group)
 {
@@ -944,79 +944,6 @@ asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
 	return ret;
 }
 
-/* Handle adjtimex compatibility. */
-
-struct timex32 {
-	u32 modes;
-	s32 offset, freq, maxerror, esterror;
-	s32 status, constant, precision, tolerance;
-	struct compat_timeval time;
-	s32 tick;
-	s32 ppsfreq, jitter, shift, stabil;
-	s32 jitcnt, calcnt, errcnt, stbcnt;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-	s32  :32; s32  :32; s32  :32; s32  :32;
-};
-
-extern int do_adjtimex(struct timex *);
-
-asmlinkage long sys32_adjtimex(struct timex32 __user *utp)
-{
-	struct timex txc;
-	int ret;
-
-	memset(&txc, 0, sizeof(struct timex));
-
-	if (get_user(txc.modes, &utp->modes) ||
-	    __get_user(txc.offset, &utp->offset) ||
-	    __get_user(txc.freq, &utp->freq) ||
-	    __get_user(txc.maxerror, &utp->maxerror) ||
-	    __get_user(txc.esterror, &utp->esterror) ||
-	    __get_user(txc.status, &utp->status) ||
-	    __get_user(txc.constant, &utp->constant) ||
-	    __get_user(txc.precision, &utp->precision) ||
-	    __get_user(txc.tolerance, &utp->tolerance) ||
-	    __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	    __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	    __get_user(txc.tick, &utp->tick) ||
-	    __get_user(txc.ppsfreq, &utp->ppsfreq) ||
-	    __get_user(txc.jitter, &utp->jitter) ||
-	    __get_user(txc.shift, &utp->shift) ||
-	    __get_user(txc.stabil, &utp->stabil) ||
-	    __get_user(txc.jitcnt, &utp->jitcnt) ||
-	    __get_user(txc.calcnt, &utp->calcnt) ||
-	    __get_user(txc.errcnt, &utp->errcnt) ||
-	    __get_user(txc.stbcnt, &utp->stbcnt))
-		return -EFAULT;
-
-	ret = do_adjtimex(&txc);
-
-	if (put_user(txc.modes, &utp->modes) ||
-	    __put_user(txc.offset, &utp->offset) ||
-	    __put_user(txc.freq, &utp->freq) ||
-	    __put_user(txc.maxerror, &utp->maxerror) ||
-	    __put_user(txc.esterror, &utp->esterror) ||
-	    __put_user(txc.status, &utp->status) ||
-	    __put_user(txc.constant, &utp->constant) ||
-	    __put_user(txc.precision, &utp->precision) ||
-	    __put_user(txc.tolerance, &utp->tolerance) ||
-	    __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
-	    __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
-	    __put_user(txc.tick, &utp->tick) ||
-	    __put_user(txc.ppsfreq, &utp->ppsfreq) ||
-	    __put_user(txc.jitter, &utp->jitter) ||
-	    __put_user(txc.shift, &utp->shift) ||
-	    __put_user(txc.stabil, &utp->stabil) ||
-	    __put_user(txc.jitcnt, &utp->jitcnt) ||
-	    __put_user(txc.calcnt, &utp->calcnt) ||
-	    __put_user(txc.errcnt, &utp->errcnt) ||
-	    __put_user(txc.stbcnt, &utp->stbcnt))
-		ret = -EFAULT;
-
-	return ret;
-}
-
 /* This is just a version for 32-bit applications which does
  * not force O_LARGEFILE on.
  */
@@ -1039,15 +966,15 @@ asmlinkage unsigned long sys32_mremap(unsigned long addr,
 	unsigned long ret = -EINVAL;
 	unsigned long new_addr = __new_addr;
 
-	if (old_len > 0xf0000000UL || new_len > 0xf0000000UL)
+	if (old_len > STACK_TOP32 || new_len > STACK_TOP32)
 		goto out;
-	if (addr > 0xf0000000UL - old_len)
+	if (addr > STACK_TOP32 - old_len)
 		goto out;
 	down_write(&current->mm->mmap_sem);
 	if (flags & MREMAP_FIXED) {
-		if (new_addr > 0xf0000000UL - new_len)
+		if (new_addr > STACK_TOP32 - new_len)
 			goto out_sem;
-	} else if (addr > 0xf0000000UL - new_len) {
+	} else if (addr > STACK_TOP32 - new_len) {
 		unsigned long map_flags = 0;
 		struct file *file = NULL;
 
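
The rewritten bound checks above compare against STACK_TOP32 instead of the bare
0xf0000000UL constant.  Testing addr > STACK_TOP32 - len (after rejecting
len > STACK_TOP32) also avoids the wraparound a naive addr + len comparison
could hit.  A small sketch of the same pattern, using a hypothetical helper name
(illustrative only, not part of the patch):

/* Hypothetical helper: nonzero when [addr, addr + len) fits below the
 * 32-bit task limit, without computing addr + len (which could wrap).
 */
static inline int compat_range_ok(unsigned long addr, unsigned long len)
{
	if (len > STACK_TOP32)
		return 0;
	return addr <= STACK_TOP32 - len;
}
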
diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S
index c3adb7ac167d..3b250f2318fd 100644
--- a/arch/sparc64/kernel/systbls.S
+++ b/arch/sparc64/kernel/systbls.S
@@ -63,7 +63,7 @@ sys_call_table32:
 /*200*/	.word sys32_ssetmask, sys_sigsuspend, compat_sys_newlstat, sys_uselib, compat_sys_old_readdir
 	.word sys32_readahead, sys32_socketcall, sys32_syslog, sys32_lookup_dcookie, sys32_fadvise64
 /*210*/	.word sys32_fadvise64_64, sys32_tgkill, sys32_waitpid, sys_swapoff, sys32_sysinfo
-	.word sys32_ipc, sys32_sigreturn, sys_clone, sys32_ioprio_get, sys32_adjtimex
+	.word sys32_ipc, sys32_sigreturn, sys_clone, sys32_ioprio_get, compat_sys_adjtimex
 /*220*/	.word sys32_sigprocmask, sys_ni_syscall, sys32_delete_module, sys_ni_syscall, sys32_getpgid
 	.word sys32_bdflush, sys32_sysfs, sys_nis_syscall, sys32_setfsuid16, sys32_setfsgid16
 /*230*/	.word sys32_select, compat_sys_time, sys_nis_syscall, compat_sys_stime, compat_sys_statfs64
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index a22930d62adf..e55b5c6ece02 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -30,6 +30,8 @@
 #include <linux/cpufreq.h>
 #include <linux/percpu.h>
 #include <linux/profile.h>
+#include <linux/miscdevice.h>
+#include <linux/rtc.h>
 
 #include <asm/oplib.h>
 #include <asm/mostek.h>
@@ -45,6 +47,7 @@
 #include <asm/smp.h>
 #include <asm/sections.h>
 #include <asm/cpudata.h>
+#include <asm/uaccess.h>
 
 DEFINE_SPINLOCK(mostek_lock);
 DEFINE_SPINLOCK(rtc_lock);
@@ -193,16 +196,22 @@ struct sparc64_tick_ops *tick_ops __read_mostly = &tick_operations;
 
 static void stick_init_tick(unsigned long offset)
 {
-	tick_disable_protection();
-
-	/* Let the user get at STICK too. */
-	__asm__ __volatile__(
-	"	rd	%%asr24, %%g2\n"
-	"	andn	%%g2, %0, %%g2\n"
-	"	wr	%%g2, 0, %%asr24"
-	: /* no outputs */
-	: "r" (TICK_PRIV_BIT)
-	: "g1", "g2");
+	/* Writes to the %tick and %stick register are not
+	 * allowed on sun4v.  The Hypervisor controls that
+	 * bit, per-strand.
+	 */
+	if (tlb_type != hypervisor) {
+		tick_disable_protection();
+
+		/* Let the user get at STICK too. */
+		__asm__ __volatile__(
+		"	rd	%%asr24, %%g2\n"
+		"	andn	%%g2, %0, %%g2\n"
+		"	wr	%%g2, 0, %%asr24"
+		: /* no outputs */
+		: "r" (TICK_PRIV_BIT)
+		: "g1", "g2");
+	}
 
 	__asm__ __volatile__(
 	"	rd	%%asr24, %%g1\n"
@@ -632,23 +641,8 @@ static void __init set_system_time(void)
 		mon = MSTK_REG_MONTH(mregs);
 		year = MSTK_CVT_YEAR( MSTK_REG_YEAR(mregs) );
 	} else {
-		int i;
-
 		/* Dallas 12887 RTC chip. */
 
-		/* Stolen from arch/i386/kernel/time.c, see there for
-		 * credits and descriptive comments.
-		 */
-		for (i = 0; i < 1000000; i++) {
-			if (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)
-				break;
-			udelay(10);
-		}
-		for (i = 0; i < 1000000; i++) {
-			if (!(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
-				break;
-			udelay(10);
-		}
 		do {
 			sec  = CMOS_READ(RTC_SECONDS);
 			min  = CMOS_READ(RTC_MINUTES);
@@ -657,6 +651,7 @@ static void __init set_system_time(void)
 			mon  = CMOS_READ(RTC_MONTH);
 			year = CMOS_READ(RTC_YEAR);
 		} while (sec != CMOS_READ(RTC_SECONDS));
+
 		if (!(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
 			BCD_TO_BIN(sec);
 			BCD_TO_BIN(min);
@@ -683,6 +678,83 @@ static void __init set_system_time(void)
 	}
 }
 
+/* davem suggests we keep this within the 4M locked kernel image */
+static u32 starfire_get_time(void)
+{
+	static char obp_gettod[32];
+	static u32 unix_tod;
+
+	sprintf(obp_gettod, "h# %08x unix-gettod",
+		(unsigned int) (long) &unix_tod);
+	prom_feval(obp_gettod);
+
+	return unix_tod;
+}
+
+static int starfire_set_time(u32 val)
+{
+	/* Do nothing, time is set using the service processor
+	 * console on this platform.
+	 */
+	return 0;
+}
+
+static u32 hypervisor_get_time(void)
+{
+	register unsigned long func asm("%o5");
+	register unsigned long arg0 asm("%o0");
+	register unsigned long arg1 asm("%o1");
+	int retries = 10000;
+
+retry:
+	func = HV_FAST_TOD_GET;
+	arg0 = 0;
+	arg1 = 0;
+	__asm__ __volatile__("ta	%6"
+			     : "=&r" (func), "=&r" (arg0), "=&r" (arg1)
+			     : "0" (func), "1" (arg0), "2" (arg1),
+			       "i" (HV_FAST_TRAP));
+	if (arg0 == HV_EOK)
+		return arg1;
+	if (arg0 == HV_EWOULDBLOCK) {
+		if (--retries > 0) {
+			udelay(100);
+			goto retry;
+		}
+		printk(KERN_WARNING "SUN4V: tod_get() timed out.\n");
+		return 0;
+	}
+	printk(KERN_WARNING "SUN4V: tod_get() not supported.\n");
+	return 0;
+}
+
+static int hypervisor_set_time(u32 secs)
+{
+	register unsigned long func asm("%o5");
+	register unsigned long arg0 asm("%o0");
+	int retries = 10000;
+
+retry:
+	func = HV_FAST_TOD_SET;
+	arg0 = secs;
+	__asm__ __volatile__("ta	%4"
+			     : "=&r" (func), "=&r" (arg0)
+			     : "0" (func), "1" (arg0),
+			       "i" (HV_FAST_TRAP));
+	if (arg0 == HV_EOK)
+		return 0;
+	if (arg0 == HV_EWOULDBLOCK) {
+		if (--retries > 0) {
+			udelay(100);
+			goto retry;
+		}
+		printk(KERN_WARNING "SUN4V: tod_set() timed out.\n");
+		return -EAGAIN;
+	}
+	printk(KERN_WARNING "SUN4V: tod_set() not supported.\n");
+	return -EOPNOTSUPP;
+}
+
 void __init clock_probe(void)
 {
 	struct linux_prom_registers clk_reg[2];
@@ -702,14 +774,14 @@ void __init clock_probe(void)
 
 
 	if (this_is_starfire) {
-		/* davem suggests we keep this within the 4M locked kernel image */
-		static char obp_gettod[256];
-		static u32 unix_tod;
-
-		sprintf(obp_gettod, "h# %08x unix-gettod",
-			(unsigned int) (long) &unix_tod);
-		prom_feval(obp_gettod);
-		xtime.tv_sec = unix_tod;
+		xtime.tv_sec = starfire_get_time();
+		xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+		set_normalized_timespec(&wall_to_monotonic,
+		                        -xtime.tv_sec, -xtime.tv_nsec);
+		return;
+	}
+	if (tlb_type == hypervisor) {
+		xtime.tv_sec = hypervisor_get_time();
 		xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
 		set_normalized_timespec(&wall_to_monotonic,
 		                        -xtime.tv_sec, -xtime.tv_nsec);
@@ -981,11 +1053,10 @@ static void sparc64_start_timers(irqreturn_t (*cfunc)(int, void *, struct pt_reg
 }
 
 struct freq_table {
-	unsigned long udelay_val_ref;
 	unsigned long clock_tick_ref;
 	unsigned int ref_freq;
 };
-static DEFINE_PER_CPU(struct freq_table, sparc64_freq_table) = { 0, 0, 0 };
+static DEFINE_PER_CPU(struct freq_table, sparc64_freq_table) = { 0, 0 };
 
 unsigned long sparc64_get_clock_tick(unsigned int cpu)
 {
@@ -1007,16 +1078,11 @@ static int sparc64_cpufreq_notifier(struct notifier_block *nb, unsigned long val
 
 	if (!ft->ref_freq) {
 		ft->ref_freq = freq->old;
-		ft->udelay_val_ref = cpu_data(cpu).udelay_val;
 		ft->clock_tick_ref = cpu_data(cpu).clock_tick;
 	}
 	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
 	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
 	    (val == CPUFREQ_RESUMECHANGE)) {
-		cpu_data(cpu).udelay_val =
-			cpufreq_scale(ft->udelay_val_ref,
-				      ft->ref_freq,
-				      freq->new);
 		cpu_data(cpu).clock_tick =
 			cpufreq_scale(ft->clock_tick_ref,
 				      ft->ref_freq,
@@ -1179,3 +1245,246 @@ static int set_rtc_mmss(unsigned long nowtime)
 		return retval;
 	}
 }
+
+#define RTC_IS_OPEN		0x01	/* means /dev/rtc is in use	*/
+static unsigned char mini_rtc_status;	/* bitmapped status byte.	*/
+
+/* months start at 0 now */
+static unsigned char days_in_mo[] =
+{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
+
+#define FEBRUARY	2
+#define	STARTOFTIME	1970
+#define SECDAY		86400L
+#define SECYR		(SECDAY * 365)
+#define	leapyear(year)		((year) % 4 == 0 && \
+				 ((year) % 100 != 0 || (year) % 400 == 0))
+#define	days_in_year(a) 	(leapyear(a) ? 366 : 365)
+#define	days_in_month(a) 	(month_days[(a) - 1])
+
+static int month_days[12] = {
+	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+/*
+ * This only works for the Gregorian calendar - i.e. after 1752 (in the UK)
+ */
+static void GregorianDay(struct rtc_time * tm)
+{
+	int leapsToDate;
+	int lastYear;
+	int day;
+	int MonthOffset[] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 };
+
+	lastYear = tm->tm_year - 1;
+
+	/*
+	 * Number of leap corrections to apply up to end of last year
+	 */
+	leapsToDate = lastYear / 4 - lastYear / 100 + lastYear / 400;
+
+	/*
+	 * This year is a leap year if it is divisible by 4 except when it is
+	 * divisible by 100 unless it is divisible by 400
+	 *
+	 * e.g. 1904 was a leap year, 1900 was not, 1996 is, and 2000 was
+	 */
+	day = tm->tm_mon > 2 && leapyear(tm->tm_year);
+
+	day += lastYear*365 + leapsToDate + MonthOffset[tm->tm_mon-1] +
+		   tm->tm_mday;
+
+	tm->tm_wday = day % 7;
+}
+
+static void to_tm(int tim, struct rtc_time *tm)
+{
+	register int    i;
+	register long   hms, day;
+
+	day = tim / SECDAY;
+	hms = tim % SECDAY;
+
+	/* Hours, minutes, seconds are easy */
+	tm->tm_hour = hms / 3600;
+	tm->tm_min = (hms % 3600) / 60;
+	tm->tm_sec = (hms % 3600) % 60;
+
+	/* Number of years in days */
+	for (i = STARTOFTIME; day >= days_in_year(i); i++)
+		day -= days_in_year(i);
+	tm->tm_year = i;
+
+	/* Number of months in days left */
+	if (leapyear(tm->tm_year))
+		days_in_month(FEBRUARY) = 29;
+	for (i = 1; day >= days_in_month(i); i++)
+		day -= days_in_month(i);
+	days_in_month(FEBRUARY) = 28;
+	tm->tm_mon = i;
+
+	/* Days are what is left over (+1) from all that. */
+	tm->tm_mday = day + 1;
+
+	/*
+	 * Determine the day of week
+	 */
+	GregorianDay(tm);
+}
+
+/* Both Starfire and SUN4V give us seconds since Jan 1st, 1970,
+ * aka Unix time.  So we have to convert to/from rtc_time.
+ */
+static inline void mini_get_rtc_time(struct rtc_time *time)
+{
+	unsigned long flags;
+	u32 seconds;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	seconds = 0;
+	if (this_is_starfire)
+		seconds = starfire_get_time();
+	else if (tlb_type == hypervisor)
+		seconds = hypervisor_get_time();
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+	to_tm(seconds, time);
+	time->tm_year -= 1900;
+	time->tm_mon -= 1;
+}
+
+static inline int mini_set_rtc_time(struct rtc_time *time)
+{
+	u32 seconds = mktime(time->tm_year + 1900, time->tm_mon + 1,
+			     time->tm_mday, time->tm_hour,
+			     time->tm_min, time->tm_sec);
+	unsigned long flags;
+	int err;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	err = -ENODEV;
+	if (this_is_starfire)
+		err = starfire_set_time(seconds);
+	else  if (tlb_type == hypervisor)
+		err = hypervisor_set_time(seconds);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+	return err;
+}
+
+static int mini_rtc_ioctl(struct inode *inode, struct file *file,
+			  unsigned int cmd, unsigned long arg)
+{
+	struct rtc_time wtime;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+
+	case RTC_PLL_GET:
+		return -EINVAL;
+
+	case RTC_PLL_SET:
+		return -EINVAL;
+
+	case RTC_UIE_OFF:	/* disable ints from RTC updates.	*/
+		return 0;
+
+	case RTC_UIE_ON:	/* enable ints for RTC updates.	*/
+	        return -EINVAL;
+
+	case RTC_RD_TIME:	/* Read the time/date from RTC	*/
+		/* this doesn't get week-day, who cares */
+		memset(&wtime, 0, sizeof(wtime));
+		mini_get_rtc_time(&wtime);
+
+		return copy_to_user(argp, &wtime, sizeof(wtime)) ? -EFAULT : 0;
+
+	case RTC_SET_TIME:	/* Set the RTC */
+	    {
+		int year;
+		unsigned char leap_yr;
+
+		if (!capable(CAP_SYS_TIME))
+			return -EACCES;
+
+		if (copy_from_user(&wtime, argp, sizeof(wtime)))
+			return -EFAULT;
+
+		year = wtime.tm_year + 1900;
+		leap_yr = ((!(year % 4) && (year % 100)) ||
+			   !(year % 400));
+
+		if ((wtime.tm_mon < 0 || wtime.tm_mon > 11) || (wtime.tm_mday < 1))
+			return -EINVAL;
+
+		if (wtime.tm_mday < 0 || wtime.tm_mday >
+		    (days_in_mo[wtime.tm_mon] + ((wtime.tm_mon == 1) && leap_yr)))
+			return -EINVAL;
+
+		if (wtime.tm_hour < 0 || wtime.tm_hour >= 24 ||
+		    wtime.tm_min < 0 || wtime.tm_min >= 60 ||
+		    wtime.tm_sec < 0 || wtime.tm_sec >= 60)
+			return -EINVAL;
+
+		return mini_set_rtc_time(&wtime);
+	    }
+	}
+
+	return -EINVAL;
+}
+
+static int mini_rtc_open(struct inode *inode, struct file *file)
+{
+	if (mini_rtc_status & RTC_IS_OPEN)
+		return -EBUSY;
+
+	mini_rtc_status |= RTC_IS_OPEN;
+
+	return 0;
+}
+
+static int mini_rtc_release(struct inode *inode, struct file *file)
+{
+	mini_rtc_status &= ~RTC_IS_OPEN;
+	return 0;
+}
+
+
+static struct file_operations mini_rtc_fops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= mini_rtc_ioctl,
+	.open		= mini_rtc_open,
+	.release	= mini_rtc_release,
+};
+
+static struct miscdevice rtc_mini_dev =
+{
+	.minor		= RTC_MINOR,
+	.name		= "rtc",
+	.fops		= &mini_rtc_fops,
+};
+
+static int __init rtc_mini_init(void)
+{
+	int retval;
+
+	if (tlb_type != hypervisor && !this_is_starfire)
+		return -ENODEV;
+
+	printk(KERN_INFO "Mini RTC Driver\n");
+
+	retval = misc_register(&rtc_mini_dev);
+	if (retval < 0)
+		return retval;
+
+	return 0;
+}
+
+static void __exit rtc_mini_exit(void)
+{
+	misc_deregister(&rtc_mini_dev);
+}
+
+
+module_init(rtc_mini_init);
+module_exit(rtc_mini_exit);
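
The mini RTC driver registered above claims the standard RTC misc minor, so on
Starfire and sun4v machines it appears as /dev/rtc and answers RTC_RD_TIME and
RTC_SET_TIME.  A short userspace sketch of reading it (illustrative only, not
part of the patch):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/rtc.h>

int main(void)
{
	struct rtc_time tm;
	int fd = open("/dev/rtc", O_RDONLY);

	if (fd < 0)
		return 1;
	if (ioctl(fd, RTC_RD_TIME, &tm) == 0)
		printf("%04d-%02d-%02d %02d:%02d:%02d\n",
		       tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
		       tm.tm_hour, tm.tm_min, tm.tm_sec);
	close(fd);
	return 0;
}
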
diff --git a/arch/sparc64/kernel/trampoline.S b/arch/sparc64/kernel/trampoline.S
index 9478551cb020..a4dc01a3d238 100644
--- a/arch/sparc64/kernel/trampoline.S
+++ b/arch/sparc64/kernel/trampoline.S
@@ -16,6 +16,8 @@
 #include <asm/processor.h>
 #include <asm/thread_info.h>
 #include <asm/mmu.h>
+#include <asm/hypervisor.h>
+#include <asm/cpudata.h>
 
 	.data
 	.align	8
@@ -28,14 +30,19 @@ itlb_load:
 dtlb_load:
 	.asciz	"SUNW,dtlb-load"
 
+	/* XXX __cpuinit this thing XXX */
+#define TRAMP_STACK_SIZE	1024
+	.align	16
+tramp_stack:
+	.skip	TRAMP_STACK_SIZE
+
 	.text
 	.align		8
 	.globl		sparc64_cpu_startup, sparc64_cpu_startup_end
 sparc64_cpu_startup:
-	flushw
-
-	BRANCH_IF_CHEETAH_BASE(g1,g5,cheetah_startup)
-	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1,g5,cheetah_plus_startup)
+	BRANCH_IF_SUN4V(g1, niagara_startup)
+	BRANCH_IF_CHEETAH_BASE(g1, g5, cheetah_startup)
+	BRANCH_IF_CHEETAH_PLUS_OR_FOLLOWON(g1, g5, cheetah_plus_startup)
 
 	ba,pt	%xcc, spitfire_startup
 	 nop
@@ -55,6 +62,7 @@ cheetah_startup:
 	or	%g5, DCU_DM | DCU_IM | DCU_DC | DCU_IC, %g5
 	stxa	%g5, [%g0] ASI_DCU_CONTROL_REG
 	membar	#Sync
+	/* fallthru */
 
 cheetah_generic_startup:
 	mov	TSB_EXTENSION_P, %g3
@@ -70,7 +78,9 @@ cheetah_generic_startup:
 	stxa	%g0, [%g3] ASI_DMMU
 	stxa	%g0, [%g3] ASI_IMMU
 	membar	#Sync
+	/* fallthru */
 
+niagara_startup:
 	/* Disable STICK_INT interrupts. */
 	sethi		%hi(0x80000000), %g5
 	sllx		%g5, 32, %g5
@@ -85,17 +95,17 @@ spitfire_startup:
 	membar		#Sync
 
 startup_continue:
-	wrpr		%g0, 15, %pil
-
 	sethi		%hi(0x80000000), %g2
 	sllx		%g2, 32, %g2
 	wr		%g2, 0, %tick_cmpr
 
+	mov		%o0, %l0
+
+	BRANCH_IF_SUN4V(g1, niagara_lock_tlb)
+
 	/* Call OBP by hand to lock KERNBASE into i/d tlbs.
 	 * We lock 2 consecutive entries if we are 'bigkernel'.
 	 */
-	mov		%o0, %l0
-
 	sethi		%hi(prom_entry_lock), %g2
 1:	ldstub		[%g2 + %lo(prom_entry_lock)], %g1
 	membar		#StoreLoad | #StoreStore
@@ -105,7 +115,6 @@ startup_continue:
 	sethi		%hi(p1275buf), %g2
 	or		%g2, %lo(p1275buf), %g2
 	ldx		[%g2 + 0x10], %l2
-	mov		%sp, %l1
 	add		%l2, -(192 + 128), %sp
 	flushw
 
@@ -142,8 +151,7 @@ startup_continue:
 
 	sethi		%hi(bigkernel), %g2
 	lduw		[%g2 + %lo(bigkernel)], %g2
-	cmp		%g2, 0
-	be,pt		%icc, do_dtlb
+	brz,pt		%g2, do_dtlb
 	 nop
 
 	sethi		%hi(call_method), %g2
@@ -214,8 +222,7 @@ do_dtlb:
 
 	sethi		%hi(bigkernel), %g2
 	lduw		[%g2 + %lo(bigkernel)], %g2
-	cmp		%g2, 0
-	be,pt		%icc, do_unlock
+	brz,pt		%g2, do_unlock
 	 nop
 
 	sethi		%hi(call_method), %g2
@@ -257,99 +264,180 @@ do_unlock:
 	stb		%g0, [%g2 + %lo(prom_entry_lock)]
 	membar		#StoreStore | #StoreLoad
 
-	mov		%l1, %sp
-	flushw
+	ba,pt		%xcc, after_lock_tlb
+	 nop
+
+niagara_lock_tlb:
+	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
+	sethi		%hi(KERNBASE), %o0
+	clr		%o1
+	sethi		%hi(kern_locked_tte_data), %o2
+	ldx		[%o2 + %lo(kern_locked_tte_data)], %o2
+	mov		HV_MMU_IMMU, %o3
+	ta		HV_FAST_TRAP
+
+	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
+	sethi		%hi(KERNBASE), %o0
+	clr		%o1
+	sethi		%hi(kern_locked_tte_data), %o2
+	ldx		[%o2 + %lo(kern_locked_tte_data)], %o2
+	mov		HV_MMU_DMMU, %o3
+	ta		HV_FAST_TRAP
 
-	mov		%l0, %o0
+	sethi		%hi(bigkernel), %g2
+	lduw		[%g2 + %lo(bigkernel)], %g2
+	brz,pt		%g2, after_lock_tlb
+	 nop
 
+	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
+	sethi		%hi(KERNBASE + 0x400000), %o0
+	clr		%o1
+	sethi		%hi(kern_locked_tte_data), %o2
+	ldx		[%o2 + %lo(kern_locked_tte_data)], %o2
+	sethi		%hi(0x400000), %o3
+	add		%o2, %o3, %o2
+	mov		HV_MMU_IMMU, %o3
+	ta		HV_FAST_TRAP
+
+	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
+	sethi		%hi(KERNBASE + 0x400000), %o0
+	clr		%o1
+	sethi		%hi(kern_locked_tte_data), %o2
+	ldx		[%o2 + %lo(kern_locked_tte_data)], %o2
+	sethi		%hi(0x400000), %o3
+	add		%o2, %o3, %o2
+	mov		HV_MMU_DMMU, %o3
+	ta		HV_FAST_TRAP
+
+after_lock_tlb:
 	wrpr		%g0, (PSTATE_PRIV | PSTATE_PEF), %pstate
 	wr		%g0, 0, %fprs
 
-	/* XXX Buggy PROM... */
-	srl		%o0, 0, %o0
-	ldx		[%o0], %g6
-
 	wr		%g0, ASI_P, %asi
 
 	mov		PRIMARY_CONTEXT, %g7
-	stxa		%g0, [%g7] ASI_DMMU
+
+661:	stxa		%g0, [%g7] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g0, [%g7] ASI_MMU
+	.previous
+
 	membar		#Sync
 	mov		SECONDARY_CONTEXT, %g7
-	stxa		%g0, [%g7] ASI_DMMU
+
+661:	stxa		%g0, [%g7] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g0, [%g7] ASI_MMU
+	.previous
+
 	membar		#Sync
 
-	mov		1, %g5
-	sllx		%g5, THREAD_SHIFT, %g5
-	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
-	add		%g6, %g5, %sp
+	/* Everything we do here, until we properly take over the
+	 * trap table, must be done with extreme care.  We cannot
+	 * make any references to %g6 (current thread pointer),
+	 * %g4 (current task pointer), or %g5 (base of current cpu's
+	 * per-cpu area) until we properly take over the trap table
+	 * from the firmware and hypervisor.
+	 *
+	 * Get onto temporary stack which is in the locked kernel image.
+	 */
+	sethi		%hi(tramp_stack), %g1
+	or		%g1, %lo(tramp_stack), %g1
+	add		%g1, TRAMP_STACK_SIZE, %g1
+	sub		%g1, STACKFRAME_SZ + STACK_BIAS, %sp
 	mov		0, %fp
 
-	wrpr		%g0, 0, %wstate
-	wrpr		%g0, 0, %tl
+	/* Put garbage in these registers to trap any access to them.  */
+	set		0xdeadbeef, %g4
+	set		0xdeadbeef, %g5
+	set		0xdeadbeef, %g6
 
-	/* Setup the trap globals, then we can resurface. */
-	rdpr		%pstate, %o1
-	mov		%g6, %o2
-	wrpr		%o1, PSTATE_AG, %pstate
-	sethi		%hi(sparc64_ttable_tl0), %g5
-	wrpr		%g5, %tba
-	mov		%o2, %g6
-
-	wrpr		%o1, PSTATE_MG, %pstate
-#define KERN_HIGHBITS		((_PAGE_VALID|_PAGE_SZ4MB)^0xfffff80000000000)
-#define KERN_LOWBITS		(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_W)
-
-	mov		TSB_REG, %g1
-	stxa		%g0, [%g1] ASI_DMMU
-	membar		#Sync
-	mov		TLB_SFSR, %g1
-	sethi		%uhi(KERN_HIGHBITS), %g2
-	or		%g2, %ulo(KERN_HIGHBITS), %g2
-	sllx		%g2, 32, %g2
-	or		%g2, KERN_LOWBITS, %g2
+	call		init_irqwork_curcpu
+	 nop
 
-	BRANCH_IF_ANY_CHEETAH(g3,g7,9f)
+	sethi		%hi(tlb_type), %g3
+	lduw		[%g3 + %lo(tlb_type)], %g2
+	cmp		%g2, 3
+	bne,pt		%icc, 1f
+	 nop
 
-	ba,pt		%xcc, 1f
+	call		hard_smp_processor_id
 	 nop
+	
+	mov		%o0, %o1
+	mov		0, %o0
+	mov		0, %o2
+	call		sun4v_init_mondo_queues
+	 mov		1, %o3
 
-9:
-	sethi		%uhi(VPTE_BASE_CHEETAH), %g3
-	or		%g3, %ulo(VPTE_BASE_CHEETAH), %g3
-	ba,pt		%xcc, 2f
-	 sllx		%g3, 32, %g3
-1:
-	sethi		%uhi(VPTE_BASE_SPITFIRE), %g3
-	or		%g3, %ulo(VPTE_BASE_SPITFIRE), %g3
-	sllx		%g3, 32, %g3
+1:	call		init_cur_cpu_trap
+	 ldx		[%l0], %o0
+
+	/* Start using proper page size encodings in ctx register.  */
+	sethi		%hi(sparc64_kern_pri_context), %g3
+	ldx		[%g3 + %lo(sparc64_kern_pri_context)], %g2
+	mov		PRIMARY_CONTEXT, %g1
 
-2:
-	clr	%g7
-#undef KERN_HIGHBITS
-#undef KERN_LOWBITS
+661:	stxa		%g2, [%g1] ASI_DMMU
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	stxa		%g2, [%g1] ASI_MMU
+	.previous
 
-	wrpr		%o1, 0x0, %pstate
-	ldx		[%g6 + TI_TASK], %g4
+	membar		#Sync
 
 	wrpr		%g0, 0, %wstate
 
-	call		init_irqwork_curcpu
+	/* As a hack, put &init_thread_union into %g6.
+	 * prom_world() loads from here to restore the %asi
+	 * register.
+	 */
+	sethi		%hi(init_thread_union), %g6
+	or		%g6, %lo(init_thread_union), %g6
+
+	sethi		%hi(is_sun4v), %o0
+	lduw		[%o0 + %lo(is_sun4v)], %o0
+	brz,pt		%o0, 1f
 	 nop
 
-	/* Start using proper page size encodings in ctx register.  */
-	sethi	%hi(sparc64_kern_pri_context), %g3
-	ldx	[%g3 + %lo(sparc64_kern_pri_context)], %g2
-	mov	PRIMARY_CONTEXT, %g1
-	stxa	%g2, [%g1] ASI_DMMU
-	membar	#Sync
+	TRAP_LOAD_TRAP_BLOCK(%g2, %g3)
+	add		%g2, TRAP_PER_CPU_FAULT_INFO, %g2
+	stxa		%g2, [%g0] ASI_SCRATCHPAD
+
+	/* Compute physical address:
+	 *
+	 * paddr = kern_base + (mmfsa_vaddr - KERNBASE)
+	 */
+	sethi		%hi(KERNBASE), %g3
+	sub		%g2, %g3, %g2
+	sethi		%hi(kern_base), %g3
+	ldx		[%g3 + %lo(kern_base)], %g3
+	add		%g2, %g3, %o1
+
+	call		prom_set_trap_table_sun4v
+	 sethi		%hi(sparc64_ttable_tl0), %o0
+
+	ba,pt		%xcc, 2f
+	 nop
+
+1:	call		prom_set_trap_table
+	 sethi		%hi(sparc64_ttable_tl0), %o0
+
+2:	ldx		[%l0], %g6
+	ldx		[%g6 + TI_TASK], %g4
+
+	mov		1, %g5
+	sllx		%g5, THREAD_SHIFT, %g5
+	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
+	add		%g6, %g5, %sp
+	mov		0, %fp
 
 	rdpr		%pstate, %o1
 	or		%o1, PSTATE_IE, %o1
 	wrpr		%o1, 0, %pstate
 
-	call		prom_set_trap_table
-	 sethi		%hi(sparc64_ttable_tl0), %o0
-
 	call		smp_callin
 	 nop
 	call		cpu_idle
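
The niagara_lock_tlb sequence above issues four HV_FAST_MMU_MAP_PERM_ADDR fast
traps to lock the kernel's 4MB mapping(s) into the I- and D-MMU.  Rendered as C
with a hypothetical wrapper sun4v_mmu_map_perm_addr() for the fast trap (vaddr
in %o0, context 0 in %o1, TTE in %o2, MMU selector in %o3, function number in
%o5), the logic looks roughly like this (illustrative only, not part of the
patch):

extern int bigkernel;
extern unsigned long kern_locked_tte_data;

static void lock_kernel_tlb_sun4v(void)
{
	unsigned long tte = kern_locked_tte_data;

	sun4v_mmu_map_perm_addr(KERNBASE, 0, tte, HV_MMU_IMMU);
	sun4v_mmu_map_perm_addr(KERNBASE, 0, tte, HV_MMU_DMMU);

	if (bigkernel) {
		/* Second locked 4MB mapping for kernels larger than 4MB. */
		sun4v_mmu_map_perm_addr(KERNBASE + 0x400000, 0,
					tte + 0x400000, HV_MMU_IMMU);
		sun4v_mmu_map_perm_addr(KERNBASE + 0x400000, 0,
					tte + 0x400000, HV_MMU_DMMU);
	}
}
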
diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c
index 8d44ae5a15e3..ff090bb9734b 100644
--- a/arch/sparc64/kernel/traps.c
+++ b/arch/sparc64/kernel/traps.c
@@ -38,22 +38,24 @@
 #include <asm/processor.h>
 #include <asm/timer.h>
 #include <asm/kdebug.h>
+#include <asm/head.h>
 #ifdef CONFIG_KMOD
 #include <linux/kmod.h>
 #endif
 
-struct notifier_block *sparc64die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+ATOMIC_NOTIFIER_HEAD(sparc64die_chain);
 
 int register_die_notifier(struct notifier_block *nb)
 {
-	int err = 0;
-	unsigned long flags;
-	spin_lock_irqsave(&die_notifier_lock, flags);
-	err = notifier_chain_register(&sparc64die_chain, nb);
-	spin_unlock_irqrestore(&die_notifier_lock, flags);
-	return err;
+	return atomic_notifier_chain_register(&sparc64die_chain, nb);
 }
+EXPORT_SYMBOL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+	return atomic_notifier_chain_unregister(&sparc64die_chain, nb);
+}
+EXPORT_SYMBOL(unregister_die_notifier);
 
 /* When an irrecoverable trap occurs at tl > 0, the trap entry
  * code logs the trap state registers at every level in the trap
@@ -72,12 +74,14 @@ struct tl1_traplog {
 
 static void dump_tl1_traplog(struct tl1_traplog *p)
 {
-	int i;
+	int i, limit;
+
+	printk(KERN_EMERG "TRAPLOG: Error at trap level 0x%lx, "
+	       "dumping trap stack.\n", p->tl);
 
-	printk("TRAPLOG: Error at trap level 0x%lx, dumping track stack.\n",
-	       p->tl);
-	for (i = 0; i < 4; i++) {
-		printk(KERN_CRIT
+	limit = (tlb_type == hypervisor) ? 2 : 4;
+	for (i = 0; i < limit; i++) {
+		printk(KERN_EMERG
 		       "TRAPLOG: Trap level %d TSTATE[%016lx] TPC[%016lx] "
 		       "TNPC[%016lx] TT[%lx]\n",
 		       i + 1,
@@ -179,6 +183,45 @@ void spitfire_insn_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr
 	spitfire_insn_access_exception(regs, sfsr, sfar);
 }
 
+void sun4v_insn_access_exception(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	unsigned short type = (type_ctx >> 16);
+	unsigned short ctx  = (type_ctx & 0xffff);
+	siginfo_t info;
+
+	if (notify_die(DIE_TRAP, "instruction access exception", regs,
+		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	if (regs->tstate & TSTATE_PRIV) {
+		printk("sun4v_insn_access_exception: ADDR[%016lx] "
+		       "CTX[%04x] TYPE[%04x], going.\n",
+		       addr, ctx, type);
+		die_if_kernel("Iax", regs);
+	}
+
+	if (test_thread_flag(TIF_32BIT)) {
+		regs->tpc &= 0xffffffff;
+		regs->tnpc &= 0xffffffff;
+	}
+	info.si_signo = SIGSEGV;
+	info.si_errno = 0;
+	info.si_code = SEGV_MAPERR;
+	info.si_addr = (void __user *) addr;
+	info.si_trapno = 0;
+	force_sig_info(SIGSEGV, &info, current);
+}
+
+void sun4v_insn_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	if (notify_die(DIE_TRAP_TL1, "instruction access exception tl1", regs,
+		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+	sun4v_insn_access_exception(regs, addr, type_ctx);
+}
+
 void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
 {
 	siginfo_t info;
@@ -227,6 +270,45 @@ void spitfire_data_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr
 	spitfire_data_access_exception(regs, sfsr, sfar);
 }
 
+void sun4v_data_access_exception(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	unsigned short type = (type_ctx >> 16);
+	unsigned short ctx  = (type_ctx & 0xffff);
+	siginfo_t info;
+
+	if (notify_die(DIE_TRAP, "data access exception", regs,
+		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	if (regs->tstate & TSTATE_PRIV) {
+		printk("sun4v_data_access_exception: ADDR[%016lx] "
+		       "CTX[%04x] TYPE[%04x], going.\n",
+		       addr, ctx, type);
+		die_if_kernel("Dax", regs);
+	}
+
+	if (test_thread_flag(TIF_32BIT)) {
+		regs->tpc &= 0xffffffff;
+		regs->tnpc &= 0xffffffff;
+	}
+	info.si_signo = SIGSEGV;
+	info.si_errno = 0;
+	info.si_code = SEGV_MAPERR;
+	info.si_addr = (void __user *) addr;
+	info.si_trapno = 0;
+	force_sig_info(SIGSEGV, &info, current);
+}
+
+void sun4v_data_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	if (notify_die(DIE_TRAP_TL1, "data access exception tl1", regs,
+		       0, 0x8, SIGTRAP) == NOTIFY_STOP)
+		return;
+
+	dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+	sun4v_data_access_exception(regs, addr, type_ctx);
+}
+
 #ifdef CONFIG_PCI
 /* This is really pathetic... */
 extern volatile int pci_poke_in_progress;
@@ -788,7 +870,8 @@ void __init cheetah_ecache_flush_init(void)
 		cheetah_error_log[i].afsr = CHAFSR_INVALID;
 
 	__asm__ ("rdpr %%ver, %0" : "=r" (ver));
-	if ((ver >> 32) == 0x003e0016) {
+	if ((ver >> 32) == __JALAPENO_ID ||
+	    (ver >> 32) == __SERRANO_ID) {
 		cheetah_error_table = &__jalapeno_error_table[0];
 		cheetah_afsr_errors = JPAFSR_ERRORS;
 	} else if ((ver >> 32) == 0x003e0015) {
@@ -1666,6 +1749,238 @@ void cheetah_plus_parity_error(int type, struct pt_regs *regs)
 	       regs->tpc);
 }
 
+struct sun4v_error_entry {
+	u64		err_handle;
+	u64		err_stick;
+
+	u32		err_type;
+#define SUN4V_ERR_TYPE_UNDEFINED	0
+#define SUN4V_ERR_TYPE_UNCORRECTED_RES	1
+#define SUN4V_ERR_TYPE_PRECISE_NONRES	2
+#define SUN4V_ERR_TYPE_DEFERRED_NONRES	3
+#define SUN4V_ERR_TYPE_WARNING_RES	4
+
+	u32		err_attrs;
+#define SUN4V_ERR_ATTRS_PROCESSOR	0x00000001
+#define SUN4V_ERR_ATTRS_MEMORY		0x00000002
+#define SUN4V_ERR_ATTRS_PIO		0x00000004
+#define SUN4V_ERR_ATTRS_INT_REGISTERS	0x00000008
+#define SUN4V_ERR_ATTRS_FPU_REGISTERS	0x00000010
+#define SUN4V_ERR_ATTRS_USER_MODE	0x01000000
+#define SUN4V_ERR_ATTRS_PRIV_MODE	0x02000000
+#define SUN4V_ERR_ATTRS_RES_QUEUE_FULL	0x80000000
+
+	u64		err_raddr;
+	u32		err_size;
+	u16		err_cpu;
+	u16		err_pad;
+};
+
+static atomic_t sun4v_resum_oflow_cnt = ATOMIC_INIT(0);
+static atomic_t sun4v_nonresum_oflow_cnt = ATOMIC_INIT(0);
+
+static const char *sun4v_err_type_to_str(u32 type)
+{
+	switch (type) {
+	case SUN4V_ERR_TYPE_UNDEFINED:
+		return "undefined";
+	case SUN4V_ERR_TYPE_UNCORRECTED_RES:
+		return "uncorrected resumable";
+	case SUN4V_ERR_TYPE_PRECISE_NONRES:
+		return "precise nonresumable";
+	case SUN4V_ERR_TYPE_DEFERRED_NONRES:
+		return "deferred nonresumable";
+	case SUN4V_ERR_TYPE_WARNING_RES:
+		return "warning resumable";
+	default:
+		return "unknown";
+	}
+}
+
+static void sun4v_log_error(struct sun4v_error_entry *ent, int cpu, const char *pfx, atomic_t *ocnt)
+{
+	int cnt;
+
+	printk("%s: Reporting on cpu %d\n", pfx, cpu);
+	printk("%s: err_handle[%lx] err_stick[%lx] err_type[%08x:%s]\n",
+	       pfx,
+	       ent->err_handle, ent->err_stick,
+	       ent->err_type,
+	       sun4v_err_type_to_str(ent->err_type));
+	printk("%s: err_attrs[%08x:%s %s %s %s %s %s %s %s]\n",
+	       pfx,
+	       ent->err_attrs,
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_PROCESSOR) ?
+		"processor" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_MEMORY) ?
+		"memory" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_PIO) ?
+		"pio" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_INT_REGISTERS) ?
+		"integer-regs" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_FPU_REGISTERS) ?
+		"fpu-regs" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_USER_MODE) ?
+		"user" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_PRIV_MODE) ?
+		"privileged" : ""),
+	       ((ent->err_attrs & SUN4V_ERR_ATTRS_RES_QUEUE_FULL) ?
+		"queue-full" : ""));
+	printk("%s: err_raddr[%016lx] err_size[%u] err_cpu[%u]\n",
+	       pfx,
+	       ent->err_raddr, ent->err_size, ent->err_cpu);
+
+	if ((cnt = atomic_read(ocnt)) != 0) {
+		atomic_set(ocnt, 0);
+		wmb();
+		printk("%s: Queue overflowed %d times.\n",
+		       pfx, cnt);
+	}
+}
+
+/* We run with %pil set to 15 and PSTATE_IE enabled in %pstate.
+ * Log the event and clear the first word of the entry.
+ */
+void sun4v_resum_error(struct pt_regs *regs, unsigned long offset)
+{
+	struct sun4v_error_entry *ent, local_copy;
+	struct trap_per_cpu *tb;
+	unsigned long paddr;
+	int cpu;
+
+	cpu = get_cpu();
+
+	tb = &trap_block[cpu];
+	paddr = tb->resum_kernel_buf_pa + offset;
+	ent = __va(paddr);
+
+	memcpy(&local_copy, ent, sizeof(struct sun4v_error_entry));
+
+	/* We have a local copy now, so release the entry.  */
+	ent->err_handle = 0;
+	wmb();
+
+	put_cpu();
+
+	sun4v_log_error(&local_copy, cpu,
+			KERN_ERR "RESUMABLE ERROR",
+			&sun4v_resum_oflow_cnt);
+}
+
+/* If we try to printk() we'll probably make matters worse, by trying
+ * to retake locks this cpu already holds or causing more errors. So
+ * just bump a counter, and we'll report these counter bumps above.
+ */
+void sun4v_resum_overflow(struct pt_regs *regs)
+{
+	atomic_inc(&sun4v_resum_oflow_cnt);
+}
+
+/* We run with %pil set to 15 and PSTATE_IE enabled in %pstate.
+ * Log the event, clear the first word of the entry, and die.
+ */
+void sun4v_nonresum_error(struct pt_regs *regs, unsigned long offset)
+{
+	struct sun4v_error_entry *ent, local_copy;
+	struct trap_per_cpu *tb;
+	unsigned long paddr;
+	int cpu;
+
+	cpu = get_cpu();
+
+	tb = &trap_block[cpu];
+	paddr = tb->nonresum_kernel_buf_pa + offset;
+	ent = __va(paddr);
+
+	memcpy(&local_copy, ent, sizeof(struct sun4v_error_entry));
+
+	/* We have a local copy now, so release the entry.  */
+	ent->err_handle = 0;
+	wmb();
+
+	put_cpu();
+
+#ifdef CONFIG_PCI
+	/* Check for the special PCI poke sequence. */
+	if (pci_poke_in_progress && pci_poke_cpu == cpu) {
+		pci_poke_faulted = 1;
+		regs->tpc += 4;
+		regs->tnpc = regs->tpc + 4;
+		return;
+	}
+#endif
+
+	sun4v_log_error(&local_copy, cpu,
+			KERN_EMERG "NON-RESUMABLE ERROR",
+			&sun4v_nonresum_oflow_cnt);
+
+	panic("Non-resumable error.");
+}
+
+/* If we try to printk() we'll probably make matters worse, by trying
+ * to retake locks this cpu already holds or causing more errors. So
+ * just bump a counter, and we'll report these counter bumps above.
+ */
+void sun4v_nonresum_overflow(struct pt_regs *regs)
+{
+	/* XXX Actually even this may not make that much sense.  Perhaps
+	 * XXX we should just pull the plug and panic directly from here?
+	 */
+	atomic_inc(&sun4v_nonresum_oflow_cnt);
+}
+
+unsigned long sun4v_err_itlb_vaddr;
+unsigned long sun4v_err_itlb_ctx;
+unsigned long sun4v_err_itlb_pte;
+unsigned long sun4v_err_itlb_error;
+
+void sun4v_itlb_error_report(struct pt_regs *regs, int tl)
+{
+	if (tl > 1)
+		dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+
+	printk(KERN_EMERG "SUN4V-ITLB: Error at TPC[%lx], tl %d\n",
+	       regs->tpc, tl);
+	printk(KERN_EMERG "SUN4V-ITLB: vaddr[%lx] ctx[%lx] "
+	       "pte[%lx] error[%lx]\n",
+	       sun4v_err_itlb_vaddr, sun4v_err_itlb_ctx,
+	       sun4v_err_itlb_pte, sun4v_err_itlb_error);
+
+	prom_halt();
+}
+
+unsigned long sun4v_err_dtlb_vaddr;
+unsigned long sun4v_err_dtlb_ctx;
+unsigned long sun4v_err_dtlb_pte;
+unsigned long sun4v_err_dtlb_error;
+
+void sun4v_dtlb_error_report(struct pt_regs *regs, int tl)
+{
+	if (tl > 1)
+		dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
+
+	printk(KERN_EMERG "SUN4V-DTLB: Error at TPC[%lx], tl %d\n",
+	       regs->tpc, tl);
+	printk(KERN_EMERG "SUN4V-DTLB: vaddr[%lx] ctx[%lx] "
+	       "pte[%lx] error[%lx]\n",
+	       sun4v_err_dtlb_vaddr, sun4v_err_dtlb_ctx,
+	       sun4v_err_dtlb_pte, sun4v_err_dtlb_error);
+
+	prom_halt();
+}
+
+void hypervisor_tlbop_error(unsigned long err, unsigned long op)
+{
+	printk(KERN_CRIT "SUN4V: TLB hv call error %lu for op %lu\n",
+	       err, op);
+}
+
+void hypervisor_tlbop_error_xcall(unsigned long err, unsigned long op)
+{
+	printk(KERN_CRIT "SUN4V: XCALL TLB hv call error %lu for op %lu\n",
+	       err, op);
+}
+
 void do_fpe_common(struct pt_regs *regs)
 {
 	if (regs->tstate & TSTATE_PRIV) {
@@ -1924,10 +2239,11 @@ void die_if_kernel(char *str, struct pt_regs *regs)
 		}
 		user_instruction_dump ((unsigned int __user *) regs->tpc);
 	}
+#if 0
 #ifdef CONFIG_SMP
 	smp_report_regs();
 #endif
-                                                	
+#endif                                                	
 	if (regs->tstate & TSTATE_PRIV)
 		do_exit(SIGKILL);
 	do_exit(SIGSEGV);
@@ -1958,6 +2274,11 @@ void do_illegal_instruction(struct pt_regs *regs)
 		} else if ((insn & 0xc1580000) == 0xc1100000) /* LDQ/STQ */ {
 			if (handle_ldf_stq(insn, regs))
 				return;
+		} else if (tlb_type == hypervisor) {
+			extern int vis_emul(struct pt_regs *, unsigned int);
+
+			if (!vis_emul(regs, insn))
+				return;
 		}
 	}
 	info.si_signo = SIGILL;
@@ -1968,6 +2289,8 @@ void do_illegal_instruction(struct pt_regs *regs)
 	force_sig_info(SIGILL, &info, current);
 }
 
+extern void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn);
+
 void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr)
 {
 	siginfo_t info;
@@ -1977,13 +2300,7 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo
 		return;
 
 	if (regs->tstate & TSTATE_PRIV) {
-		extern void kernel_unaligned_trap(struct pt_regs *regs,
-						  unsigned int insn, 
-						  unsigned long sfar,
-						  unsigned long sfsr);
-
-		kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc),
-				      sfar, sfsr);
+		kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc));
 		return;
 	}
 	info.si_signo = SIGBUS;
@@ -1994,6 +2311,26 @@ void mem_address_unaligned(struct pt_regs *regs, unsigned long sfar, unsigned lo
 	force_sig_info(SIGBUS, &info, current);
 }
 
+void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
+{
+	siginfo_t info;
+
+	if (notify_die(DIE_TRAP, "memory address unaligned", regs,
+		       0, 0x34, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (regs->tstate & TSTATE_PRIV) {
+		kernel_unaligned_trap(regs, *((unsigned int *)regs->tpc));
+		return;
+	}
+	info.si_signo = SIGBUS;
+	info.si_errno = 0;
+	info.si_code = BUS_ADRALN;
+	info.si_addr = (void __user *) addr;
+	info.si_trapno = 0;
+	force_sig_info(SIGBUS, &info, current);
+}
+
 void do_privop(struct pt_regs *regs)
 {
 	siginfo_t info;
@@ -2130,7 +2467,23 @@ void do_getpsr(struct pt_regs *regs)
 	}
 }
 
+struct trap_per_cpu trap_block[NR_CPUS];
+
+/* This can get invoked before sched_init() so play it super safe
+ * and use hard_smp_processor_id().
+ */
+void init_cur_cpu_trap(struct thread_info *t)
+{
+	int cpu = hard_smp_processor_id();
+	struct trap_per_cpu *p = &trap_block[cpu];
+
+	p->thread = t;
+	p->pgd_paddr = 0;
+}
+
 extern void thread_info_offsets_are_bolixed_dave(void);
+extern void trap_per_cpu_offsets_are_bolixed_dave(void);
+extern void tsb_config_offsets_are_bolixed_dave(void);
 
 /* Only invoked on boot processor. */
 void __init trap_init(void)
@@ -2154,7 +2507,6 @@ void __init trap_init(void)
 	    TI_KERN_CNTD0 != offsetof(struct thread_info, kernel_cntd0) ||
 	    TI_KERN_CNTD1 != offsetof(struct thread_info, kernel_cntd1) ||
 	    TI_PCR != offsetof(struct thread_info, pcr_reg) ||
-	    TI_CEE_STUFF != offsetof(struct thread_info, cee_stuff) ||
 	    TI_PRE_COUNT != offsetof(struct thread_info, preempt_count) ||
 	    TI_NEW_CHILD != offsetof(struct thread_info, new_child) ||
 	    TI_SYS_NOERROR != offsetof(struct thread_info, syscall_noerror) ||
@@ -2165,6 +2517,47 @@ void __init trap_init(void)
 	    (TI_FPREGS & (64 - 1)))
 		thread_info_offsets_are_bolixed_dave();
 
+	if (TRAP_PER_CPU_THREAD != offsetof(struct trap_per_cpu, thread) ||
+	    (TRAP_PER_CPU_PGD_PADDR !=
+	     offsetof(struct trap_per_cpu, pgd_paddr)) ||
+	    (TRAP_PER_CPU_CPU_MONDO_PA !=
+	     offsetof(struct trap_per_cpu, cpu_mondo_pa)) ||
+	    (TRAP_PER_CPU_DEV_MONDO_PA !=
+	     offsetof(struct trap_per_cpu, dev_mondo_pa)) ||
+	    (TRAP_PER_CPU_RESUM_MONDO_PA !=
+	     offsetof(struct trap_per_cpu, resum_mondo_pa)) ||
+	    (TRAP_PER_CPU_RESUM_KBUF_PA !=
+	     offsetof(struct trap_per_cpu, resum_kernel_buf_pa)) ||
+	    (TRAP_PER_CPU_NONRESUM_MONDO_PA !=
+	     offsetof(struct trap_per_cpu, nonresum_mondo_pa)) ||
+	    (TRAP_PER_CPU_NONRESUM_KBUF_PA !=
+	     offsetof(struct trap_per_cpu, nonresum_kernel_buf_pa)) ||
+	    (TRAP_PER_CPU_FAULT_INFO !=
+	     offsetof(struct trap_per_cpu, fault_info)) ||
+	    (TRAP_PER_CPU_CPU_MONDO_BLOCK_PA !=
+	     offsetof(struct trap_per_cpu, cpu_mondo_block_pa)) ||
+	    (TRAP_PER_CPU_CPU_LIST_PA !=
+	     offsetof(struct trap_per_cpu, cpu_list_pa)) ||
+	    (TRAP_PER_CPU_TSB_HUGE !=
+	     offsetof(struct trap_per_cpu, tsb_huge)) ||
+	    (TRAP_PER_CPU_TSB_HUGE_TEMP !=
+	     offsetof(struct trap_per_cpu, tsb_huge_temp)))
+		trap_per_cpu_offsets_are_bolixed_dave();
+
+	if ((TSB_CONFIG_TSB !=
+	     offsetof(struct tsb_config, tsb)) ||
+	    (TSB_CONFIG_RSS_LIMIT !=
+	     offsetof(struct tsb_config, tsb_rss_limit)) ||
+	    (TSB_CONFIG_NENTRIES !=
+	     offsetof(struct tsb_config, tsb_nentries)) ||
+	    (TSB_CONFIG_REG_VAL !=
+	     offsetof(struct tsb_config, tsb_reg_val)) ||
+	    (TSB_CONFIG_MAP_VADDR !=
+	     offsetof(struct tsb_config, tsb_map_vaddr)) ||
+	    (TSB_CONFIG_MAP_PTE !=
+	     offsetof(struct tsb_config, tsb_map_pte)))
+		tsb_config_offsets_are_bolixed_dave();
+
 	/* Attach to the address space of init_task.  On SMP we
 	 * do this in smp.c:smp_callin for other cpus.
 	 */
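
The die notifier chain above is now an atomic notifier head with exported
register/unregister helpers, so other code can hook trap reporting without
taking a spinlock.  A sketch of a caller, with hypothetical handler names
(illustrative only, not part of the patch):

#include <linux/kernel.h>
#include <linux/notifier.h>
#include <asm/kdebug.h>

static int example_die_handler(struct notifier_block *self,
			       unsigned long val, void *data)
{
	struct die_args *args = data;

	if (val == DIE_TRAP)
		printk(KERN_INFO "die notifier: %s at TPC[%016lx]\n",
		       args->str, args->regs->tpc);
	return NOTIFY_DONE;
}

static struct notifier_block example_die_nb = {
	.notifier_call	= example_die_handler,
};

/* register_die_notifier(&example_die_nb) at init time,
 * unregister_die_notifier(&example_die_nb) on teardown.
 */
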
diff --git a/arch/sparc64/kernel/tsb.S b/arch/sparc64/kernel/tsb.S
new file mode 100644
index 000000000000..a0c8ba58920b
--- /dev/null
+++ b/arch/sparc64/kernel/tsb.S
@@ -0,0 +1,552 @@
+/* tsb.S: Sparc64 TSB table handling.
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/config.h>
+
+#include <asm/tsb.h>
+#include <asm/hypervisor.h>
+#include <asm/page.h>
+#include <asm/cpudata.h>
+#include <asm/mmu.h>
+
+	.text
+	.align	32
+
+	/* Invoked from TLB miss handler, we are in the
+	 * MMU global registers and they are setup like
+	 * this:
+	 *
+	 * %g1: TSB entry pointer
+	 * %g2:	available temporary
+	 * %g3:	FAULT_CODE_{D,I}TLB
+	 * %g4:	available temporary
+	 * %g5:	available temporary
+	 * %g6: TAG TARGET
+	 * %g7:	available temporary, will be loaded by us with
+	 *      the physical address base of the linux page
+	 *      tables for the current address space
+	 */
+tsb_miss_dtlb:
+	mov		TLB_TAG_ACCESS, %g4
+	ba,pt		%xcc, tsb_miss_page_table_walk
+	 ldxa		[%g4] ASI_DMMU, %g4
+
+tsb_miss_itlb:
+	mov		TLB_TAG_ACCESS, %g4
+	ba,pt		%xcc, tsb_miss_page_table_walk
+	 ldxa		[%g4] ASI_IMMU, %g4
+
+	/* At this point we have:
+	 * %g1 --	PAGE_SIZE TSB entry address
+	 * %g3 --	FAULT_CODE_{D,I}TLB
+	 * %g4 --	missing virtual address
+	 * %g6 --	TAG TARGET (vaddr >> 22)
+	 */
+tsb_miss_page_table_walk:
+	TRAP_LOAD_TRAP_BLOCK(%g7, %g5)
+
+	/* Before committing to a full page table walk,
+	 * check the huge page TSB.
+	 */
+#ifdef CONFIG_HUGETLB_PAGE
+
+661:	ldx		[%g7 + TRAP_PER_CPU_TSB_HUGE], %g5
+	nop
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	mov		SCRATCHPAD_UTSBREG2, %g5
+	ldxa		[%g5] ASI_SCRATCHPAD, %g5
+	.previous
+
+	cmp		%g5, -1
+	be,pt		%xcc, 80f
+	 nop
+
+	/* We need an aligned pair of registers containing 2 values
+	 * which can be easily rematerialized.  %g6 and %g7 foot the
+	 * bill just nicely.  We'll save %g6 away into %g2 for the
+	 * huge page TSB TAG comparison.
+	 *
+	 * Perform a huge page TSB lookup.
+	 */
+	mov		%g6, %g2
+	and		%g5, 0x7, %g6
+	mov		512, %g7
+	andn		%g5, 0x7, %g5
+	sllx		%g7, %g6, %g7
+	srlx		%g4, HPAGE_SHIFT, %g6
+	sub		%g7, 1, %g7
+	and		%g6, %g7, %g6
+	sllx		%g6, 4, %g6
+	add		%g5, %g6, %g5
+
+	TSB_LOAD_QUAD(%g5, %g6)
+	cmp		%g6, %g2
+	be,a,pt		%xcc, tsb_tlb_reload
+	 mov		%g7, %g5
+
+	/* No match, remember the huge page TSB entry address,
+	 * and restore %g6 and %g7.
+	 */
+	TRAP_LOAD_TRAP_BLOCK(%g7, %g6)
+	srlx		%g4, 22, %g6
+80:	stx		%g5, [%g7 + TRAP_PER_CPU_TSB_HUGE_TEMP]
+
+#endif
+
+	ldx		[%g7 + TRAP_PER_CPU_PGD_PADDR], %g7
+
+	/* At this point we have:
+	 * %g1 --	TSB entry address
+	 * %g3 --	FAULT_CODE_{D,I}TLB
+	 * %g4 --	missing virtual address
+	 * %g6 --	TAG TARGET (vaddr >> 22)
+	 * %g7 --	page table physical address
+	 *
+	 * We know that both the base PAGE_SIZE TSB and the HPAGE_SIZE
+	 * TSB both lack a matching entry.
+	 */
+tsb_miss_page_table_walk_sun4v_fastpath:
+	USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault)
+
+	/* Load and check PTE.  */
+	ldxa		[%g5] ASI_PHYS_USE_EC, %g5
+	brgez,pn	%g5, tsb_do_fault
+	 nop
+
+#ifdef CONFIG_HUGETLB_PAGE
+661:	sethi		%uhi(_PAGE_SZALL_4U), %g7
+	sllx		%g7, 32, %g7
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	mov		_PAGE_SZALL_4V, %g7
+	nop
+	.previous
+
+	and		%g5, %g7, %g2
+
+661:	sethi		%uhi(_PAGE_SZHUGE_4U), %g7
+	sllx		%g7, 32, %g7
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	mov		_PAGE_SZHUGE_4V, %g7
+	nop
+	.previous
+
+	cmp		%g2, %g7
+	bne,pt		%xcc, 60f
+	 nop
+
+	/* It is a huge page, use huge page TSB entry address we
+	 * calculated above.
+	 */
+	TRAP_LOAD_TRAP_BLOCK(%g7, %g2)
+	ldx		[%g7 + TRAP_PER_CPU_TSB_HUGE_TEMP], %g2
+	cmp		%g2, -1
+	movne		%xcc, %g2, %g1
+60:
+#endif
+
+	/* At this point we have:
+	 * %g1 --	TSB entry address
+	 * %g3 --	FAULT_CODE_{D,I}TLB
+	 * %g5 --	valid PTE
+	 * %g6 --	TAG TARGET (vaddr >> 22)
+	 */
+tsb_reload:
+	TSB_LOCK_TAG(%g1, %g2, %g7)
+	TSB_WRITE(%g1, %g5, %g6)
+
+	/* Finally, load TLB and return from trap.  */
+tsb_tlb_reload:
+	cmp		%g3, FAULT_CODE_DTLB
+	bne,pn		%xcc, tsb_itlb_load
+	 nop
+
+tsb_dtlb_load:
+
+661:	stxa		%g5, [%g0] ASI_DTLB_DATA_IN
+	retry
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_DTLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_dtlb_load
+	 mov		%g5, %g3
+
+tsb_itlb_load:
+	/* Executable bit must be set.  */
+661:	andcc		%g5, _PAGE_EXEC_4U, %g0
+	.section	.sun4v_1insn_patch, "ax"
+	.word		661b
+	andcc		%g5, _PAGE_EXEC_4V, %g0
+	.previous
+
+	be,pn		%xcc, tsb_do_fault
+	 nop
+
+661:	stxa		%g5, [%g0] ASI_ITLB_DATA_IN
+	retry
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
+	/* For sun4v the ASI_ITLB_DATA_IN store and the retry
+	 * instruction get nop'd out and we get here to branch
+	 * to the sun4v tlb load code.  The registers are setup
+	 * as follows:
+	 *
+	 * %g4: vaddr
+	 * %g5: PTE
+	 * %g6:	TAG
+	 *
+	 * The sun4v TLB load wants the PTE in %g3 so we fix that
+	 * up here.
+	 */
+	ba,pt		%xcc, sun4v_itlb_load
+	 mov		%g5, %g3
+
+	/* No valid entry in the page tables, do full fault
+	 * processing.
+	 */
+
+	.globl		tsb_do_fault
+tsb_do_fault:
+	cmp		%g3, FAULT_CODE_DTLB
+
+661:	rdpr		%pstate, %g5
+	wrpr		%g5, PSTATE_AG | PSTATE_MG, %pstate
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	SET_GL(1)
+	ldxa		[%g0] ASI_SCRATCHPAD, %g4
+	.previous
+
+	bne,pn		%xcc, tsb_do_itlb_fault
+	 nop
+
+tsb_do_dtlb_fault:
+	rdpr	%tl, %g3
+	cmp	%g3, 1
+
+661:	mov	TLB_TAG_ACCESS, %g4
+	ldxa	[%g4] ASI_DMMU, %g5
+	.section .sun4v_2insn_patch, "ax"
+	.word	661b
+	ldx	[%g4 + HV_FAULT_D_ADDR_OFFSET], %g5
+	nop
+	.previous
+
+	be,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_DTLB, %g4
+	ba,pt	%xcc, winfix_trampoline
+	 nop
+
+tsb_do_itlb_fault:
+	rdpr	%tpc, %g5
+	ba,pt	%xcc, sparc64_realfault_common
+	 mov	FAULT_CODE_ITLB, %g4
+
+	.globl	sparc64_realfault_common
+sparc64_realfault_common:
+	/* fault code in %g4, fault address in %g5, etrap will
+	 * preserve these two values in %l4 and %l5 respectively
+	 */
+	ba,pt	%xcc, etrap			! Save trap state
+1:	 rd	%pc, %g7			! ...
+	stb	%l4, [%g6 + TI_FAULT_CODE]	! Save fault code
+	stx	%l5, [%g6 + TI_FAULT_ADDR]	! Save fault address
+	call	do_sparc64_fault		! Call fault handler
+	 add	%sp, PTREGS_OFF, %o0		! Compute pt_regs arg
+	ba,pt	%xcc, rtrap_clr_l6		! Restore cpu state
+	 nop					! Delay slot (fill me)
+
+winfix_trampoline:
+	rdpr	%tpc, %g3			! Prepare winfixup TNPC
+	or	%g3, 0x7c, %g3			! Compute branch offset
+	wrpr	%g3, %tnpc			! Write it into TNPC
+	done					! Trap return
+
+	/* Insert an entry into the TSB.
+	 *
+	 * %o0: TSB entry pointer (virt or phys address)
+	 * %o1: tag
+	 * %o2:	pte
+	 */
+	.align	32
+	.globl	__tsb_insert
+__tsb_insert:
+	rdpr	%pstate, %o5
+	wrpr	%o5, PSTATE_IE, %pstate
+	TSB_LOCK_TAG(%o0, %g2, %g3)
+	TSB_WRITE(%o0, %o2, %o1)
+	wrpr	%o5, %pstate
+	retl
+	 nop
+	.size	__tsb_insert, .-__tsb_insert
+
+	/* Flush the given TSB entry if it has the matching
+	 * tag.
+	 *
+	 * %o0: TSB entry pointer (virt or phys address)
+	 * %o1:	tag
+	 */
+	.align	32
+	.globl	tsb_flush
+	.type	tsb_flush,#function
+tsb_flush:
+	sethi	%hi(TSB_TAG_LOCK_HIGH), %g2
+1:	TSB_LOAD_TAG(%o0, %g1)
+	srlx	%g1, 32, %o3
+	andcc	%o3, %g2, %g0
+	bne,pn	%icc, 1b
+	 membar	#LoadLoad
+	cmp	%g1, %o1
+	mov	1, %o3
+	bne,pt	%xcc, 2f
+	 sllx	%o3, TSB_TAG_INVALID_BIT, %o3
+	TSB_CAS_TAG(%o0, %g1, %o3)
+	cmp	%g1, %o3
+	bne,pn	%xcc, 1b
+	 nop
+2:	retl
+	 TSB_MEMBAR
+	.size	tsb_flush, .-tsb_flush
+
+	/* Reload MMU related context switch state at
+	 * schedule() time.
+	 *
+	 * %o0: page table physical address
+	 * %o1:	TSB base config pointer
+	 * %o2:	TSB huge config pointer, or NULL if none
+	 * %o3:	Hypervisor TSB descriptor physical address
+	 *
+	 * We have to run this whole thing with interrupts
+	 * disabled so that the current cpu doesn't change
+	 * due to preemption.
+	 */
+	.align	32
+	.globl	__tsb_context_switch
+	.type	__tsb_context_switch,#function
+__tsb_context_switch:
+	rdpr	%pstate, %g1
+	wrpr	%g1, PSTATE_IE, %pstate
+
+	TRAP_LOAD_TRAP_BLOCK(%g2, %g3)
+
+	stx	%o0, [%g2 + TRAP_PER_CPU_PGD_PADDR]
+
+	ldx	[%o1 + TSB_CONFIG_REG_VAL], %o0
+	brz,pt	%o2, 1f
+	 mov	-1, %g3
+
+	ldx	[%o2 + TSB_CONFIG_REG_VAL], %g3
+
+1:	stx	%g3, [%g2 + TRAP_PER_CPU_TSB_HUGE]
+
+	sethi	%hi(tlb_type), %g2
+	lduw	[%g2 + %lo(tlb_type)], %g2
+	cmp	%g2, 3
+	bne,pt	%icc, 50f
+	 nop
+
+	/* Hypervisor TSB switch. */
+	mov	SCRATCHPAD_UTSBREG1, %o5
+	stxa	%o0, [%o5] ASI_SCRATCHPAD
+	mov	SCRATCHPAD_UTSBREG2, %o5
+	stxa	%g3, [%o5] ASI_SCRATCHPAD
+
+	mov	2, %o0
+	cmp	%g3, -1
+	move	%xcc, 1, %o0
+
+	mov	HV_FAST_MMU_TSB_CTXNON0, %o5
+	mov	%o3, %o1
+	ta	HV_FAST_TRAP
+
+	/* Finish up.  */
+	ba,pt	%xcc, 9f
+	 nop
+
+	/* SUN4U TSB switch.  */
+50:	mov	TSB_REG, %o5
+	stxa	%o0, [%o5] ASI_DMMU
+	membar	#Sync
+	stxa	%o0, [%o5] ASI_IMMU
+	membar	#Sync
+
+2:	ldx	[%o1 + TSB_CONFIG_MAP_VADDR], %o4
+	brz	%o4, 9f
+	 ldx	[%o1 + TSB_CONFIG_MAP_PTE], %o5
+
+	sethi	%hi(sparc64_highest_unlocked_tlb_ent), %g2
+	mov	TLB_TAG_ACCESS, %g3
+	lduw	[%g2 + %lo(sparc64_highest_unlocked_tlb_ent)], %g2
+	stxa	%o4, [%g3] ASI_DMMU
+	membar	#Sync
+	sllx	%g2, 3, %g2
+	stxa	%o5, [%g2] ASI_DTLB_DATA_ACCESS
+	membar	#Sync
+
+	brz,pt	%o2, 9f
+	 nop
+
+	ldx	[%o2 + TSB_CONFIG_MAP_VADDR], %o4
+	ldx	[%o2 + TSB_CONFIG_MAP_PTE], %o5
+	mov	TLB_TAG_ACCESS, %g3
+	stxa	%o4, [%g3] ASI_DMMU
+	membar	#Sync
+	sub	%g2, (1 << 3), %g2
+	stxa	%o5, [%g2] ASI_DTLB_DATA_ACCESS
+	membar	#Sync
+
+9:
+	wrpr	%g1, %pstate
+
+	retl
+	 nop
+	.size	__tsb_context_switch, .-__tsb_context_switch
+
+#define TSB_PASS_BITS	((1 << TSB_TAG_LOCK_BIT) | \
+			 (1 << TSB_TAG_INVALID_BIT))
+
+	.align	32
+	.globl	copy_tsb
+	.type	copy_tsb,#function
+copy_tsb:		/* %o0=old_tsb_base, %o1=old_tsb_size
+			 * %o2=new_tsb_base, %o3=new_tsb_size
+			 */
+	sethi		%uhi(TSB_PASS_BITS), %g7
+	srlx		%o3, 4, %o3
+	add		%o0, %o1, %g1	/* end of old tsb */
+	sllx		%g7, 32, %g7
+	sub		%o3, 1, %o3	/* %o3 == new tsb hash mask */
+
+661:	prefetcha	[%o0] ASI_N, #one_read
+	.section	.tsb_phys_patch, "ax"
+	.word		661b
+	prefetcha	[%o0] ASI_PHYS_USE_EC, #one_read
+	.previous
+
+90:	andcc		%o0, (64 - 1), %g0
+	bne		1f
+	 add		%o0, 64, %o5
+
+661:	prefetcha	[%o5] ASI_N, #one_read
+	.section	.tsb_phys_patch, "ax"
+	.word		661b
+	prefetcha	[%o5] ASI_PHYS_USE_EC, #one_read
+	.previous
+
+1:	TSB_LOAD_QUAD(%o0, %g2)		/* %g2/%g3 == TSB entry */
+	andcc		%g2, %g7, %g0	/* LOCK or INVALID set? */
+	bne,pn		%xcc, 80f	/* Skip it */
+	 sllx		%g2, 22, %o4	/* TAG --> VADDR */
+
+	/* This can definitely be computed faster... */
+	srlx		%o0, 4, %o5	/* Build index */
+	and		%o5, 511, %o5	/* Mask index */
+	sllx		%o5, PAGE_SHIFT, %o5 /* Put into vaddr position */
+	or		%o4, %o5, %o4	/* Full VADDR. */
+	srlx		%o4, PAGE_SHIFT, %o4 /* Shift down to create index */
+	and		%o4, %o3, %o4	/* Mask with new_tsb_nents-1 */
+	sllx		%o4, 4, %o4	/* Shift back up into tsb ent offset */
+	TSB_STORE(%o2 + %o4, %g2)	/* Store TAG */
+	add		%o4, 0x8, %o4	/* Advance to TTE */
+	TSB_STORE(%o2 + %o4, %g3)	/* Store TTE */
+
+80:	add		%o0, 16, %o0
+	cmp		%o0, %g1
+	bne,pt		%xcc, 90b
+	 nop
+
+	retl
+	 TSB_MEMBAR
+	.size		copy_tsb, .-copy_tsb
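
The middle of the loop above rebuilds a virtual address from the old entry's tag plus its old index, then rehashes it into the new TSB. A standalone C version of that index arithmetic, assuming 8K pages and, like the assembly, a 512-entry source TSB (both values are assumptions of this sketch):

	#include <stdint.h>

	#define PAGE_SHIFT		13	/* assumed 8K pages */
	#define OLD_TSB_NENTRIES	512	/* mirrors the "and %o5, 511" above */

	static unsigned long new_tsb_byte_offset(unsigned long old_byte_offset,
						 uint64_t tag,
						 unsigned long new_nentries)
	{
		unsigned long old_index = (old_byte_offset >> 4) & (OLD_TSB_NENTRIES - 1);
		unsigned long vaddr = ((unsigned long)tag << 22) | (old_index << PAGE_SHIFT);
		unsigned long new_index = (vaddr >> PAGE_SHIFT) & (new_nentries - 1);

		return new_index << 4;	/* each TSB entry is 16 bytes */
	}
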
+
+	/* Set the invalid bit in all TSB entries.  */
+	.align		32
+	.globl		tsb_init
+	.type		tsb_init,#function
+tsb_init:		/* %o0 = TSB vaddr, %o1 = size in bytes */
+	prefetch	[%o0 + 0x000], #n_writes
+	mov		1, %g1
+	prefetch	[%o0 + 0x040], #n_writes
+	sllx		%g1, TSB_TAG_INVALID_BIT, %g1
+	prefetch	[%o0 + 0x080], #n_writes
+1:	prefetch	[%o0 + 0x0c0], #n_writes
+	stx		%g1, [%o0 + 0x00]
+	stx		%g1, [%o0 + 0x10]
+	stx		%g1, [%o0 + 0x20]
+	stx		%g1, [%o0 + 0x30]
+	prefetch	[%o0 + 0x100], #n_writes
+	stx		%g1, [%o0 + 0x40]
+	stx		%g1, [%o0 + 0x50]
+	stx		%g1, [%o0 + 0x60]
+	stx		%g1, [%o0 + 0x70]
+	prefetch	[%o0 + 0x140], #n_writes
+	stx		%g1, [%o0 + 0x80]
+	stx		%g1, [%o0 + 0x90]
+	stx		%g1, [%o0 + 0xa0]
+	stx		%g1, [%o0 + 0xb0]
+	prefetch	[%o0 + 0x180], #n_writes
+	stx		%g1, [%o0 + 0xc0]
+	stx		%g1, [%o0 + 0xd0]
+	stx		%g1, [%o0 + 0xe0]
+	stx		%g1, [%o0 + 0xf0]
+	subcc		%o1, 0x100, %o1
+	bne,pt		%xcc, 1b
+	 add		%o0, 0x100, %o0
+	retl
+	 nop
+	nop
+	nop
+	.size		tsb_init, .-tsb_init
+
+	.globl		NGtsb_init
+	.type		NGtsb_init,#function
+NGtsb_init:
+	rd		%asi, %g2
+	mov		1, %g1
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	sllx		%g1, TSB_TAG_INVALID_BIT, %g1
+1:	stxa		%g1, [%o0 + 0x00] %asi
+	stxa		%g1, [%o0 + 0x10] %asi
+	stxa		%g1, [%o0 + 0x20] %asi
+	stxa		%g1, [%o0 + 0x30] %asi
+	stxa		%g1, [%o0 + 0x40] %asi
+	stxa		%g1, [%o0 + 0x50] %asi
+	stxa		%g1, [%o0 + 0x60] %asi
+	stxa		%g1, [%o0 + 0x70] %asi
+	stxa		%g1, [%o0 + 0x80] %asi
+	stxa		%g1, [%o0 + 0x90] %asi
+	stxa		%g1, [%o0 + 0xa0] %asi
+	stxa		%g1, [%o0 + 0xb0] %asi
+	stxa		%g1, [%o0 + 0xc0] %asi
+	stxa		%g1, [%o0 + 0xd0] %asi
+	stxa		%g1, [%o0 + 0xe0] %asi
+	stxa		%g1, [%o0 + 0xf0] %asi
+	subcc		%o1, 0x100, %o1
+	bne,pt		%xcc, 1b
+	 add		%o0, 0x100, %o0
+	retl
+	 wr		%g2, 0x0, %asi
+	.size		NGtsb_init, .-NGtsb_init
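
Both initializers above do the same thing: write the invalid-bit pattern into the tag word of every 16-byte entry, leaving the TTE half untouched. In plain C, with the bit number as an assumption:

	#include <stdint.h>

	#define TSB_TAG_INVALID_BIT	46	/* assumed value */

	static void tsb_init_sketch(void *tsb, unsigned long size_bytes)
	{
		uint64_t inval = 1ULL << TSB_TAG_INVALID_BIT;
		uint64_t *p = tsb;
		unsigned long i;

		/* Entry layout: 8-byte tag, then 8-byte TTE; only the
		 * tag word needs the invalid pattern. */
		for (i = 0; i < size_bytes / 16; i++)
			p[i * 2] = inval;
	}
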
diff --git a/arch/sparc64/kernel/ttable.S b/arch/sparc64/kernel/ttable.S
index 8365bc1f81f3..5d901519db55 100644
--- a/arch/sparc64/kernel/ttable.S
+++ b/arch/sparc64/kernel/ttable.S
@@ -1,7 +1,6 @@
-/* $Id: ttable.S,v 1.38 2002/02/09 19:49:30 davem Exp $
- * ttable.S: Sparc V9 Trap Table(s) with SpitFire/Cheetah extensions.
+/* ttable.S: Sparc V9 Trap Table(s) with SpitFire/Cheetah/SUN4V extensions.
  *
- * Copyright (C) 1996, 2001 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996, 2001, 2006 David S. Miller (davem@davemloft.net)
  */
 
 #include <linux/config.h>
@@ -19,7 +18,7 @@ tl0_resv000:	BOOT_KERNEL BTRAP(0x1) BTRAP(0x2) BTRAP(0x3)
 tl0_resv004:	BTRAP(0x4)  BTRAP(0x5) BTRAP(0x6) BTRAP(0x7)
 tl0_iax:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_insn_access_exception)
-tl0_resv009:	BTRAP(0x9)
+tl0_itsb_4v:	SUN4V_ITSB_MISS
 tl0_iae:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_access_error)
 tl0_resv00b:	BTRAP(0xb) BTRAP(0xc) BTRAP(0xd) BTRAP(0xe) BTRAP(0xf)
@@ -38,7 +37,7 @@ tl0_div0:	TRAP(do_div0)
 tl0_resv029:	BTRAP(0x29) BTRAP(0x2a) BTRAP(0x2b) BTRAP(0x2c) BTRAP(0x2d) BTRAP(0x2e)
 tl0_resv02f:	BTRAP(0x2f)
 tl0_dax:	TRAP_NOSAVE(__spitfire_data_access_exception)
-tl0_resv031:	BTRAP(0x31)
+tl0_dtsb_4v:	SUN4V_DTSB_MISS
 tl0_dae:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_access_error)
 tl0_resv033:	BTRAP(0x33)
@@ -52,12 +51,13 @@ tl0_resv03e:	BTRAP(0x3e) BTRAP(0x3f) BTRAP(0x40)
 tl0_irq1:	TRAP_IRQ(smp_call_function_client, 1)
 tl0_irq2:	TRAP_IRQ(smp_receive_signal_client, 2)
 tl0_irq3:	TRAP_IRQ(smp_penguin_jailcell, 3)
+tl0_irq4:	TRAP_IRQ(smp_new_mmu_context_version_client, 4)
 #else
 tl0_irq1:	BTRAP(0x41)
 tl0_irq2:	BTRAP(0x42)
 tl0_irq3:	BTRAP(0x43)
+tl0_irq4:	BTRAP(0x44)
 #endif
-tl0_irq4:	TRAP_IRQ(handler_irq, 4)
 tl0_irq5:	TRAP_IRQ(handler_irq, 5)  TRAP_IRQ(handler_irq, 6)
 tl0_irq7:	TRAP_IRQ(handler_irq, 7)  TRAP_IRQ(handler_irq, 8)
 tl0_irq9:	TRAP_IRQ(handler_irq, 9)  TRAP_IRQ(handler_irq, 10)
@@ -78,9 +78,9 @@ tl0_vaw:	TRAP(do_vaw)
 tl0_cee:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_cee_trap)
 tl0_iamiss:
-#include	"itlb_base.S"
+#include	"itlb_miss.S"
 tl0_damiss:
-#include	"dtlb_base.S"
+#include	"dtlb_miss.S"
 tl0_daprot:
 #include	"dtlb_prot.S"
 tl0_fecc:	BTRAP(0x70)	/* Fast-ECC on Cheetah */
@@ -88,15 +88,18 @@ tl0_dcpe:	BTRAP(0x71)	/* D-cache Parity Error on Cheetah+ */
 tl0_icpe:	BTRAP(0x72)	/* I-cache Parity Error on Cheetah+ */
 tl0_resv073:	BTRAP(0x73) BTRAP(0x74) BTRAP(0x75)
 tl0_resv076:	BTRAP(0x76) BTRAP(0x77) BTRAP(0x78) BTRAP(0x79) BTRAP(0x7a) BTRAP(0x7b)
-tl0_resv07c:	BTRAP(0x7c) BTRAP(0x7d) BTRAP(0x7e) BTRAP(0x7f)
+tl0_cpu_mondo:	TRAP_NOSAVE(sun4v_cpu_mondo)
+tl0_dev_mondo:	TRAP_NOSAVE(sun4v_dev_mondo)
+tl0_res_mondo:	TRAP_NOSAVE(sun4v_res_mondo)
+tl0_nres_mondo:	TRAP_NOSAVE(sun4v_nonres_mondo)
 tl0_s0n:	SPILL_0_NORMAL
 tl0_s1n:	SPILL_1_NORMAL
 tl0_s2n:	SPILL_2_NORMAL
-tl0_s3n:	SPILL_3_NORMAL
-tl0_s4n:	SPILL_4_NORMAL
-tl0_s5n:	SPILL_5_NORMAL
-tl0_s6n:	SPILL_6_NORMAL
-tl0_s7n:	SPILL_7_NORMAL
+tl0_s3n:	SPILL_0_NORMAL_ETRAP
+tl0_s4n:	SPILL_1_GENERIC_ETRAP
+tl0_s5n:	SPILL_1_GENERIC_ETRAP_FIXUP
+tl0_s6n:	SPILL_2_GENERIC_ETRAP
+tl0_s7n:	SPILL_2_GENERIC_ETRAP_FIXUP
 tl0_s0o:	SPILL_0_OTHER
 tl0_s1o:	SPILL_1_OTHER
 tl0_s2o:	SPILL_2_OTHER
@@ -110,9 +113,9 @@ tl0_f1n:	FILL_1_NORMAL
 tl0_f2n:	FILL_2_NORMAL
 tl0_f3n:	FILL_3_NORMAL
 tl0_f4n:	FILL_4_NORMAL
-tl0_f5n:	FILL_5_NORMAL
-tl0_f6n:	FILL_6_NORMAL
-tl0_f7n:	FILL_7_NORMAL
+tl0_f5n:	FILL_0_NORMAL_RTRAP
+tl0_f6n:	FILL_1_GENERIC_RTRAP
+tl0_f7n:	FILL_2_GENERIC_RTRAP
 tl0_f0o:	FILL_0_OTHER
 tl0_f1o:	FILL_1_OTHER
 tl0_f2o:	FILL_2_OTHER
@@ -128,7 +131,7 @@ tl0_flushw:	FLUSH_WINDOW_TRAP
 tl0_resv104:	BTRAP(0x104) BTRAP(0x105) BTRAP(0x106) BTRAP(0x107)
 		.globl tl0_solaris
 tl0_solaris:	SOLARIS_SYSCALL_TRAP
-tl0_netbsd:	NETBSD_SYSCALL_TRAP
+tl0_resv109:	BTRAP(0x109)
 tl0_resv10a:	BTRAP(0x10a) BTRAP(0x10b) BTRAP(0x10c) BTRAP(0x10d) BTRAP(0x10e)
 tl0_resv10f:	BTRAP(0x10f)
 tl0_linux32:	LINUX_32BIT_SYSCALL_TRAP
@@ -179,7 +182,7 @@ sparc64_ttable_tl1:
 tl1_resv000:	BOOT_KERNEL    BTRAPTL1(0x1) BTRAPTL1(0x2) BTRAPTL1(0x3)
 tl1_resv004:	BTRAPTL1(0x4)  BTRAPTL1(0x5) BTRAPTL1(0x6) BTRAPTL1(0x7)
 tl1_iax:	TRAP_NOSAVE(__spitfire_insn_access_exception_tl1)
-tl1_resv009:	BTRAPTL1(0x9)
+tl1_itsb_4v:	SUN4V_ITSB_MISS
 tl1_iae:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_access_error)
 tl1_resv00b:	BTRAPTL1(0xb) BTRAPTL1(0xc) BTRAPTL1(0xd) BTRAPTL1(0xe) BTRAPTL1(0xf)
@@ -198,7 +201,7 @@ tl1_div0:	TRAPTL1(do_div0_tl1)
 tl1_resv029:	BTRAPTL1(0x29) BTRAPTL1(0x2a) BTRAPTL1(0x2b) BTRAPTL1(0x2c)
 tl1_resv02d:	BTRAPTL1(0x2d) BTRAPTL1(0x2e) BTRAPTL1(0x2f)
 tl1_dax:	TRAP_NOSAVE(__spitfire_data_access_exception_tl1)
-tl1_resv031:	BTRAPTL1(0x31)
+tl1_dtsb_4v:	SUN4V_DTSB_MISS
 tl1_dae:	membar #Sync
 		TRAP_NOSAVE_7INSNS(__spitfire_access_error)
 tl1_resv033:	BTRAPTL1(0x33)
@@ -222,26 +225,10 @@ tl1_resv05c:	BTRAPTL1(0x5c) BTRAPTL1(0x5d) BTRAPTL1(0x5e) BTRAPTL1(0x5f)
 tl1_ivec:	TRAP_IVEC
 tl1_paw:	TRAPTL1(do_paw_tl1)
 tl1_vaw:	TRAPTL1(do_vaw_tl1)
-
-		/* The grotty trick to save %g1 into current->thread.cee_stuff
-		 * is because when we take this trap we could be interrupting
-		 * trap code already using the trap alternate global registers.
-		 *
-		 * We cross our fingers and pray that this store/load does
-		 * not cause yet another CEE trap.
-		 */
-tl1_cee:	membar	#Sync
-		stx	%g1, [%g6 + TI_CEE_STUFF]
-		ldxa	[%g0] ASI_AFSR, %g1
-		membar	#Sync
-		stxa	%g1, [%g0] ASI_AFSR
-		membar	#Sync
-		ldx	[%g6 + TI_CEE_STUFF], %g1
-		retry
-
+tl1_cee:	BTRAPTL1(0x63)
 tl1_iamiss:	BTRAPTL1(0x64) BTRAPTL1(0x65) BTRAPTL1(0x66) BTRAPTL1(0x67)
 tl1_damiss:
-#include	"dtlb_backend.S"
+#include	"dtlb_miss.S"
 tl1_daprot:
 #include	"dtlb_prot.S"
 tl1_fecc:	BTRAPTL1(0x70)	/* Fast-ECC on Cheetah */
diff --git a/arch/sparc64/kernel/unaligned.c b/arch/sparc64/kernel/unaligned.c
index 70faf630603b..001e8518331f 100644
--- a/arch/sparc64/kernel/unaligned.c
+++ b/arch/sparc64/kernel/unaligned.c
@@ -277,7 +277,7 @@ static void kernel_mna_trap_fault(void)
 	regs->tstate |= (ASI_AIUS << 24UL);
 }
 
-asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn, unsigned long sfar, unsigned long sfsr)
+asmlinkage void kernel_unaligned_trap(struct pt_regs *regs, unsigned int insn)
 {
 	enum direction dir = decode_direction(insn);
 	int size = decode_access_size(insn);
@@ -405,6 +405,9 @@ extern void do_privact(struct pt_regs *regs);
 extern void spitfire_data_access_exception(struct pt_regs *regs,
 					   unsigned long sfsr,
 					   unsigned long sfar);
+extern void sun4v_data_access_exception(struct pt_regs *regs,
+					unsigned long addr,
+					unsigned long type_ctx);
 
 int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 {
@@ -447,14 +450,20 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 				break;
 			}
 		default:
-			spitfire_data_access_exception(regs, 0, addr);
+			if (tlb_type == hypervisor)
+				sun4v_data_access_exception(regs, addr, 0);
+			else
+				spitfire_data_access_exception(regs, 0, addr);
 			return 1;
 		}
 		if (put_user (first >> 32, (u32 __user *)addr) ||
 		    __put_user ((u32)first, (u32 __user *)(addr + 4)) ||
 		    __put_user (second >> 32, (u32 __user *)(addr + 8)) ||
 		    __put_user ((u32)second, (u32 __user *)(addr + 12))) {
-		    	spitfire_data_access_exception(regs, 0, addr);
+			if (tlb_type == hypervisor)
+				sun4v_data_access_exception(regs, addr, 0);
+			else
+				spitfire_data_access_exception(regs, 0, addr);
 		    	return 1;
 		}
 	} else {
@@ -467,7 +476,10 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 			do_privact(regs);
 			return 1;
 		} else if (asi > ASI_SNFL) {
-			spitfire_data_access_exception(regs, 0, addr);
+			if (tlb_type == hypervisor)
+				sun4v_data_access_exception(regs, addr, 0);
+			else
+				spitfire_data_access_exception(regs, 0, addr);
 			return 1;
 		}
 		switch (insn & 0x180000) {
@@ -484,7 +496,10 @@ int handle_ldf_stq(u32 insn, struct pt_regs *regs)
 				err |= __get_user (data[i], (u32 __user *)(addr + 4*i));
 		}
 		if (err && !(asi & 0x2 /* NF */)) {
-			spitfire_data_access_exception(regs, 0, addr);
+			if (tlb_type == hypervisor)
+				sun4v_data_access_exception(regs, addr, 0);
+			else
+				spitfire_data_access_exception(regs, 0, addr);
 			return 1;
 		}
 		if (asi & 0x8) /* Little */ {
@@ -548,7 +563,7 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 	u32 insn;
 	u32 first, second;
 	u64 value;
-	u8 asi, freg;
+	u8 freg;
 	int flag;
 	struct fpustate *f = FPUSTATE;
 
@@ -557,7 +572,7 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 	if (test_thread_flag(TIF_32BIT))
 		pc = (u32)pc;
 	if (get_user(insn, (u32 __user *) pc) != -EFAULT) {
-		asi = sfsr >> 16;
+		int asi = decode_asi(insn, regs);
 		if ((asi > ASI_SNFL) ||
 		    (asi < ASI_P))
 			goto daex;
@@ -587,7 +602,11 @@ void handle_lddfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 		*(u64 *)(f->regs + freg) = value;
 		current_thread_info()->fpsaved[0] |= flag;
 	} else {
-daex:		spitfire_data_access_exception(regs, sfsr, sfar);
+daex:
+		if (tlb_type == hypervisor)
+			sun4v_data_access_exception(regs, sfar, sfsr);
+		else
+			spitfire_data_access_exception(regs, sfsr, sfar);
 		return;
 	}
 	advance(regs);
@@ -600,7 +619,7 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 	unsigned long tstate = regs->tstate;
 	u32 insn;
 	u64 value;
-	u8 asi, freg;
+	u8 freg;
 	int flag;
 	struct fpustate *f = FPUSTATE;
 
@@ -609,8 +628,8 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 	if (test_thread_flag(TIF_32BIT))
 		pc = (u32)pc;
 	if (get_user(insn, (u32 __user *) pc) != -EFAULT) {
+		int asi = decode_asi(insn, regs);
 		freg = ((insn >> 25) & 0x1e) | ((insn >> 20) & 0x20);
-		asi = sfsr >> 16;
 		value = 0;
 		flag = (freg < 32) ? FPRS_DL : FPRS_DU;
 		if ((asi > ASI_SNFL) ||
@@ -631,7 +650,11 @@ void handle_stdfmna(struct pt_regs *regs, unsigned long sfar, unsigned long sfsr
 		    __put_user ((u32)value, (u32 __user *)(sfar + 4)))
 			goto daex;
 	} else {
-daex:		spitfire_data_access_exception(regs, sfsr, sfar);
+daex:
+		if (tlb_type == hypervisor)
+			sun4v_data_access_exception(regs, sfar, sfsr);
+		else
+			spitfire_data_access_exception(regs, sfsr, sfar);
 		return;
 	}
 	advance(regs);
diff --git a/arch/sparc64/kernel/us2e_cpufreq.c b/arch/sparc64/kernel/us2e_cpufreq.c
index b35dc8dc995a..1f83fe6a82d6 100644
--- a/arch/sparc64/kernel/us2e_cpufreq.c
+++ b/arch/sparc64/kernel/us2e_cpufreq.c
@@ -346,6 +346,9 @@ static int __init us2e_freq_init(void)
 	unsigned long manuf, impl, ver;
 	int ret;
 
+	if (tlb_type != spitfire)
+		return -ENODEV;
+
 	__asm__("rdpr %%ver, %0" : "=r" (ver));
 	manuf = ((ver >> 48) & 0xffff);
 	impl  = ((ver >> 32) & 0xffff);
@@ -354,20 +357,16 @@ static int __init us2e_freq_init(void)
 		struct cpufreq_driver *driver;
 
 		ret = -ENOMEM;
-		driver = kmalloc(sizeof(struct cpufreq_driver), GFP_KERNEL);
+		driver = kzalloc(sizeof(struct cpufreq_driver), GFP_KERNEL);
 		if (!driver)
 			goto err_out;
-		memset(driver, 0, sizeof(*driver));
 
-		us2e_freq_table = kmalloc(
+		us2e_freq_table = kzalloc(
 			(NR_CPUS * sizeof(struct us2e_freq_percpu_info)),
 			GFP_KERNEL);
 		if (!us2e_freq_table)
 			goto err_out;
 
-		memset(us2e_freq_table, 0,
-		       (NR_CPUS * sizeof(struct us2e_freq_percpu_info)));
-
 		driver->init = us2e_freq_cpu_init;
 		driver->verify = us2e_freq_verify;
 		driver->target = us2e_freq_target;
diff --git a/arch/sparc64/kernel/us3_cpufreq.c b/arch/sparc64/kernel/us3_cpufreq.c
index 6d1f9a3c464f..47e3acafb5be 100644
--- a/arch/sparc64/kernel/us3_cpufreq.c
+++ b/arch/sparc64/kernel/us3_cpufreq.c
@@ -203,6 +203,9 @@ static int __init us3_freq_init(void)
 	unsigned long manuf, impl, ver;
 	int ret;
 
+	if (tlb_type != cheetah && tlb_type != cheetah_plus)
+		return -ENODEV;
+
 	__asm__("rdpr %%ver, %0" : "=r" (ver));
 	manuf = ((ver >> 48) & 0xffff);
 	impl  = ((ver >> 32) & 0xffff);
@@ -215,20 +218,16 @@ static int __init us3_freq_init(void)
 		struct cpufreq_driver *driver;
 
 		ret = -ENOMEM;
-		driver = kmalloc(sizeof(struct cpufreq_driver), GFP_KERNEL);
+		driver = kzalloc(sizeof(struct cpufreq_driver), GFP_KERNEL);
 		if (!driver)
 			goto err_out;
-		memset(driver, 0, sizeof(*driver));
 
-		us3_freq_table = kmalloc(
+		us3_freq_table = kzalloc(
 			(NR_CPUS * sizeof(struct us3_freq_percpu_info)),
 			GFP_KERNEL);
 		if (!us3_freq_table)
 			goto err_out;
 
-		memset(us3_freq_table, 0,
-		       (NR_CPUS * sizeof(struct us3_freq_percpu_info)));
-
 		driver->init = us3_freq_cpu_init;
 		driver->verify = us3_freq_verify;
 		driver->target = us3_freq_target;
diff --git a/arch/sparc64/kernel/visemul.c b/arch/sparc64/kernel/visemul.c
new file mode 100644
index 000000000000..84fedaa38aae
--- /dev/null
+++ b/arch/sparc64/kernel/visemul.c
@@ -0,0 +1,894 @@
+/* visemul.c: Emulation of VIS instructions.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/thread_info.h>
+
+#include <asm/ptrace.h>
+#include <asm/pstate.h>
+#include <asm/system.h>
+#include <asm/fpumacro.h>
+#include <asm/uaccess.h>
+
+/* OPF field of various VIS instructions.  */
+
+/* 000111011 - four 16-bit packs  */
+#define FPACK16_OPF	0x03b
+
+/* 000111010 - two 32-bit packs  */
+#define FPACK32_OPF	0x03a
+
+/* 000111101 - two 16-bit packs from 32-bit fixed-point values  */
+#define FPACKFIX_OPF	0x03d
+
+/* 001001101 - four 16-bit expands  */
+#define FEXPAND_OPF	0x04d
+
+/* 001001011 - two 32-bit merges */
+#define FPMERGE_OPF	0x04b
+
+/* 000110001 - 8-by-16-bit partitioned product  */
+#define FMUL8x16_OPF	0x031
+
+/* 000110011 - 8-by-16-bit upper alpha partitioned product  */
+#define FMUL8x16AU_OPF	0x033
+
+/* 000110101 - 8-by-16-bit lower alpha partitioned product  */
+#define FMUL8x16AL_OPF	0x035
+
+/* 000110110 - upper 8-by-16-bit partitioned product  */
+#define FMUL8SUx16_OPF	0x036
+
+/* 000110111 - lower 8-by-16-bit partitioned product  */
+#define FMUL8ULx16_OPF	0x037
+
+/* 000111000 - upper 8-by-16-bit partitioned product  */
+#define FMULD8SUx16_OPF	0x038
+
+/* 000111001 - lower unsigned 8-by-16-bit partitioned product  */
+#define FMULD8ULx16_OPF	0x039
+
+/* 000101000 - four 16-bit compare; set rd if src1 > src2  */
+#define FCMPGT16_OPF	0x028
+
+/* 000101100 - two 32-bit compare; set rd if src1 > src2  */
+#define FCMPGT32_OPF	0x02c
+
+/* 000100000 - four 16-bit compare; set rd if src1 <= src2  */
+#define FCMPLE16_OPF	0x020
+
+/* 000100100 - two 32-bit compare; set rd if src1 <= src2  */
+#define FCMPLE32_OPF	0x024
+
+/* 000100010 - four 16-bit compare; set rd if src1 != src2  */
+#define FCMPNE16_OPF	0x022
+
+/* 000100110 - two 32-bit compare; set rd if src1 != src2  */
+#define FCMPNE32_OPF	0x026
+
+/* 000101010 - four 16-bit compare; set rd if src1 == src2  */
+#define FCMPEQ16_OPF	0x02a
+
+/* 000101110 - two 32-bit compare; set rd if src1 == src2  */
+#define FCMPEQ32_OPF	0x02e
+
+/* 000000000 - Eight 8-bit edge boundary processing  */
+#define EDGE8_OPF	0x000
+
+/* 000000001 - Eight 8-bit edge boundary processing, no CC */
+#define EDGE8N_OPF	0x001
+
+/* 000000010 - Eight 8-bit edge boundary processing, little-endian  */
+#define EDGE8L_OPF	0x002
+
+/* 000000011 - Eight 8-bit edge boundary processing, little-endian, no CC  */
+#define EDGE8LN_OPF	0x003
+
+/* 000000100 - Four 16-bit edge boundary processing  */
+#define EDGE16_OPF	0x004
+
+/* 000000101 - Four 16-bit edge boundary processing, no CC  */
+#define EDGE16N_OPF	0x005
+
+/* 000000110 - Four 16-bit edge boundary processing, little-endian  */
+#define EDGE16L_OPF	0x006
+
+/* 000000111 - Four 16-bit edge boundary processing, little-endian, no CC  */
+#define EDGE16LN_OPF	0x007
+
+/* 000001000 - Two 32-bit edge boundary processing  */
+#define EDGE32_OPF	0x008
+
+/* 000001001 - Two 32-bit edge boundary processing, no CC  */
+#define EDGE32N_OPF	0x009
+
+/* 000001010 - Two 32-bit edge boundary processing, little-endian  */
+#define EDGE32L_OPF	0x00a
+
+/* 000001011 - Two 32-bit edge boundary processing, little-endian, no CC  */
+#define EDGE32LN_OPF	0x00b
+
+/* 000111110 - distance between 8 8-bit components  */
+#define PDIST_OPF	0x03e
+
+/* 000010000 - convert 8-bit 3-D address to blocked byte address  */
+#define ARRAY8_OPF	0x010
+
+/* 000010010 - convert 16-bit 3-D address to blocked byte address  */
+#define ARRAY16_OPF	0x012
+
+/* 000010100 - convert 32-bit 3-D address to blocked byte address  */
+#define ARRAY32_OPF	0x014
+
+/* 000011001 - Set the GSR.MASK field in preparation for a BSHUFFLE  */
+#define BMASK_OPF	0x019
+
+/* 001001100 - Permute bytes as specified by GSR.MASK  */
+#define BSHUFFLE_OPF	0x04c
+
+#define VIS_OPCODE_MASK	((0x3 << 30) | (0x3f << 19))
+#define VIS_OPCODE_VAL	((0x2 << 30) | (0x36 << 19))
+
+#define VIS_OPF_SHIFT	5
+#define VIS_OPF_MASK	(0x1ff << VIS_OPF_SHIFT)
+
+#define RS1(INSN)	(((INSN) >> 14) & 0x1f)
+#define RS2(INSN)	(((INSN) >>  0) & 0x1f)
+#define RD(INSN)	(((INSN) >> 25) & 0x1f)
+
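
To make the decode concrete, here is a tiny self-contained program using the same mask/value pair and OPF extraction as the macros above (written with unsigned constants to keep the shifts well defined); the sample word is built from the macros themselves rather than taken from a real binary:

	#include <stdio.h>

	#define VIS_OPCODE_MASK	((0x3u << 30) | (0x3fu << 19))
	#define VIS_OPCODE_VAL	((0x2u << 30) | (0x36u << 19))
	#define VIS_OPF_SHIFT	5
	#define VIS_OPF_MASK	(0x1ffu << VIS_OPF_SHIFT)
	#define FPACK16_OPF	0x03b

	int main(void)
	{
		unsigned int insn = VIS_OPCODE_VAL | (FPACK16_OPF << VIS_OPF_SHIFT);

		if ((insn & VIS_OPCODE_MASK) == VIS_OPCODE_VAL)
			printf("VIS insn, opf = 0x%03x\n",
			       (insn & VIS_OPF_MASK) >> VIS_OPF_SHIFT);
		return 0;
	}
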
+static inline void maybe_flush_windows(unsigned int rs1, unsigned int rs2,
+				       unsigned int rd, int from_kernel)
+{
+	if (rs2 >= 16 || rs1 >= 16 || rd >= 16) {
+		if (from_kernel != 0)
+			__asm__ __volatile__("flushw");
+		else
+			flushw_user();
+	}
+}
+
+static unsigned long fetch_reg(unsigned int reg, struct pt_regs *regs)
+{
+	unsigned long value;
+	
+	if (reg < 16)
+		return (!reg ? 0 : regs->u_regs[reg]);
+	if (regs->tstate & TSTATE_PRIV) {
+		struct reg_window *win;
+		win = (struct reg_window *)(regs->u_regs[UREG_FP] + STACK_BIAS);
+		value = win->locals[reg - 16];
+	} else if (test_thread_flag(TIF_32BIT)) {
+		struct reg_window32 __user *win32;
+		win32 = (struct reg_window32 __user *)((unsigned long)((u32)regs->u_regs[UREG_FP]));
+		get_user(value, &win32->locals[reg - 16]);
+	} else {
+		struct reg_window __user *win;
+		win = (struct reg_window __user *)(regs->u_regs[UREG_FP] + STACK_BIAS);
+		get_user(value, &win->locals[reg - 16]);
+	}
+	return value;
+}
+
+static inline unsigned long __user *__fetch_reg_addr_user(unsigned int reg,
+							  struct pt_regs *regs)
+{
+	BUG_ON(reg < 16);
+	BUG_ON(regs->tstate & TSTATE_PRIV);
+
+	if (test_thread_flag(TIF_32BIT)) {
+		struct reg_window32 __user *win32;
+		win32 = (struct reg_window32 __user *)((unsigned long)((u32)regs->u_regs[UREG_FP]));
+		return (unsigned long __user *)&win32->locals[reg - 16];
+	} else {
+		struct reg_window __user *win;
+		win = (struct reg_window __user *)(regs->u_regs[UREG_FP] + STACK_BIAS);
+		return &win->locals[reg - 16];
+	}
+}
+
+static inline unsigned long *__fetch_reg_addr_kern(unsigned int reg,
+						   struct pt_regs *regs)
+{
+	BUG_ON(reg >= 16);
+	BUG_ON(regs->tstate & TSTATE_PRIV);
+
+	return &regs->u_regs[reg];
+}
+
+static void store_reg(struct pt_regs *regs, unsigned long val, unsigned long rd)
+{
+	if (rd < 16) {
+		unsigned long *rd_kern = __fetch_reg_addr_kern(rd, regs);
+
+		*rd_kern = val;
+	} else {
+		unsigned long __user *rd_user = __fetch_reg_addr_user(rd, regs);
+
+		if (test_thread_flag(TIF_32BIT))
+			__put_user((u32)val, (u32 __user *)rd_user);
+		else
+			__put_user(val, rd_user);
+	}
+}
+
+static inline unsigned long fpd_regval(struct fpustate *f,
+				       unsigned int insn_regnum)
+{
+	insn_regnum = (((insn_regnum & 1) << 5) |
+		       (insn_regnum & 0x1e));
+
+	return *(unsigned long *) &f->regs[insn_regnum];
+}
+
+static inline unsigned long *fpd_regaddr(struct fpustate *f,
+					 unsigned int insn_regnum)
+{
+	insn_regnum = (((insn_regnum & 1) << 5) |
+		       (insn_regnum & 0x1e));
+
+	return (unsigned long *) &f->regs[insn_regnum];
+}
+
+static inline unsigned int fps_regval(struct fpustate *f,
+				      unsigned int insn_regnum)
+{
+	return f->regs[insn_regnum];
+}
+
+static inline unsigned int *fps_regaddr(struct fpustate *f,
+					unsigned int insn_regnum)
+{
+	return &f->regs[insn_regnum];
+}
+
+struct edge_tab {
+	u16 left, right;
+};
+struct edge_tab edge8_tab[8] = {
+	{ 0xff, 0x80 },
+	{ 0x7f, 0xc0 },
+	{ 0x3f, 0xe0 },
+	{ 0x1f, 0xf0 },
+	{ 0x0f, 0xf8 },
+	{ 0x07, 0xfc },
+	{ 0x03, 0xfe },
+	{ 0x01, 0xff },
+};
+struct edge_tab edge8_tab_l[8] = {
+	{ 0xff, 0x01 },
+	{ 0xfe, 0x03 },
+	{ 0xfc, 0x07 },
+	{ 0xf8, 0x0f },
+	{ 0xf0, 0x1f },
+	{ 0xe0, 0x3f },
+	{ 0xc0, 0x7f },
+	{ 0x80, 0xff },
+};
+struct edge_tab edge16_tab[4] = {
+	{ 0xf, 0x8 },
+	{ 0x7, 0xc },
+	{ 0x3, 0xe },
+	{ 0x1, 0xf },
+};
+struct edge_tab edge16_tab_l[4] = {
+	{ 0xf, 0x1 },
+	{ 0xe, 0x3 },
+	{ 0xc, 0x7 },
+	{ 0x8, 0xf },
+};
+struct edge_tab edge32_tab[2] = {
+	{ 0x3, 0x2 },
+	{ 0x1, 0x3 },
+};
+struct edge_tab edge32_tab_l[2] = {
+	{ 0x3, 0x1 },
+	{ 0x2, 0x3 },
+};
+
+static void edge(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	unsigned long orig_rs1, rs1, orig_rs2, rs2, rd_val;
+	u16 left, right;
+
+	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
+	orig_rs1 = rs1 = fetch_reg(RS1(insn), regs);
+	orig_rs2 = rs2 = fetch_reg(RS2(insn), regs);
+
+	if (test_thread_flag(TIF_32BIT)) {
+		rs1 = rs1 & 0xffffffff;
+		rs2 = rs2 & 0xffffffff;
+	}
+	switch (opf) {
+	default:
+	case EDGE8_OPF:
+	case EDGE8N_OPF:
+		left = edge8_tab[rs1 & 0x7].left;
+		right = edge8_tab[rs2 & 0x7].right;
+		break;
+	case EDGE8L_OPF:
+	case EDGE8LN_OPF:
+		left = edge8_tab_l[rs1 & 0x7].left;
+		right = edge8_tab_l[rs2 & 0x7].right;
+		break;
+
+	case EDGE16_OPF:
+	case EDGE16N_OPF:
+		left = edge16_tab[(rs1 >> 1) & 0x3].left;
+		right = edge16_tab[(rs2 >> 1) & 0x3].right;
+		break;
+
+	case EDGE16L_OPF:
+	case EDGE16LN_OPF:
+		left = edge16_tab_l[(rs1 >> 1) & 0x3].left;
+		right = edge16_tab_l[(rs2 >> 1) & 0x3].right;
+		break;
+
+	case EDGE32_OPF:
+	case EDGE32N_OPF:
+		left = edge32_tab[(rs1 >> 2) & 0x1].left;
+		right = edge32_tab[(rs2 >> 2) & 0x1].right;
+		break;
+
+	case EDGE32L_OPF:
+	case EDGE32LN_OPF:
+		left = edge32_tab_l[(rs1 >> 2) & 0x1].left;
+		right = edge32_tab_l[(rs2 >> 2) & 0x1].right;
+		break;
+	};
+
+	if ((rs1 & ~0x7UL) == (rs2 & ~0x7UL))
+		rd_val = right & left;
+	else
+		rd_val = left;
+
+	store_reg(regs, rd_val, RD(insn));
+
+	switch (opf) {
+	case EDGE8_OPF:
+	case EDGE8L_OPF:
+	case EDGE16_OPF:
+	case EDGE16L_OPF:
+	case EDGE32_OPF:
+	case EDGE32L_OPF: {
+		unsigned long ccr, tstate;
+
+		__asm__ __volatile__("subcc	%1, %2, %%g0\n\t"
+				     "rd	%%ccr, %0"
+				     : "=r" (ccr)
+				     : "r" (orig_rs1), "r" (orig_rs2)
+				     : "cc");
+		tstate = regs->tstate & ~(TSTATE_XCC | TSTATE_ICC);
+		regs->tstate = tstate | (ccr << 32UL);
+	}
+	};
+}
+
+static void array(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	unsigned long rs1, rs2, rd_val;
+	unsigned int bits, bits_mask;
+
+	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
+	rs1 = fetch_reg(RS1(insn), regs);
+	rs2 = fetch_reg(RS2(insn), regs);
+
+	bits = (rs2 > 5 ? 5 : rs2);
+	bits_mask = (1UL << bits) - 1UL;
+
+	rd_val = ((((rs1 >> 11) & 0x3) <<  0) |
+		  (((rs1 >> 33) & 0x3) <<  2) |
+		  (((rs1 >> 55) & 0x1) <<  4) |
+		  (((rs1 >> 13) & 0xf) <<  5) |
+		  (((rs1 >> 35) & 0xf) <<  9) |
+		  (((rs1 >> 56) & 0xf) << 13) |
+		  (((rs1 >> 17) & bits_mask) << 17) |
+		  (((rs1 >> 39) & bits_mask) << (17 + bits)) |
+		  (((rs1 >> 60) & 0xf)       << (17 + (2*bits))));
+
+	switch (opf) {
+	case ARRAY16_OPF:
+		rd_val <<= 1;
+		break;
+
+	case ARRAY32_OPF:
+		rd_val <<= 2;
+	};
+
+	store_reg(regs, rd_val, RD(insn));
+}
+
+static void bmask(struct pt_regs *regs, unsigned int insn)
+{
+	unsigned long rs1, rs2, rd_val, gsr;
+
+	maybe_flush_windows(RS1(insn), RS2(insn), RD(insn), 0);
+	rs1 = fetch_reg(RS1(insn), regs);
+	rs2 = fetch_reg(RS2(insn), regs);
+	rd_val = rs1 + rs2;
+
+	store_reg(regs, rd_val, RD(insn));
+
+	gsr = current_thread_info()->gsr[0] & 0xffffffff;
+	gsr |= rd_val << 32UL;
+	current_thread_info()->gsr[0] = gsr;
+}
+
+static void bshuffle(struct pt_regs *regs, unsigned int insn)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, rd_val;
+	unsigned long bmask, i;
+
+	bmask = current_thread_info()->gsr[0] >> 32UL;
+
+	rs1 = fpd_regval(f, RS1(insn));
+	rs2 = fpd_regval(f, RS2(insn));
+
+	rd_val = 0UL;
+	for (i = 0; i < 8; i++) {
+		unsigned long which = (bmask >> (i * 4)) & 0xf;
+		unsigned long byte;
+
+		if (which < 8)
+			byte = (rs1 >> (which * 8)) & 0xff;
+		else
+			byte = (rs2 >> ((which-8)*8)) & 0xff;
+		rd_val |= (byte << (i * 8));
+	}
+
+	*fpd_regaddr(f, RD(insn)) = rd_val;
+}
+
+static void pdist(struct pt_regs *regs, unsigned int insn)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, *rd, rd_val;
+	unsigned long i;
+
+	rs1 = fpd_regval(f, RS1(insn));
+	rs2 = fpd_regval(f, RS2(insn));
+	rd = fpd_regaddr(f, RD(insn));
+
+	rd_val = *rd;
+
+	for (i = 0; i < 8; i++) {
+		s16 s1, s2;
+
+		s1 = (rs1 >> (56 - (i * 8))) & 0xff;
+		s2 = (rs2 >> (56 - (i * 8))) & 0xff;
+
+		/* Absolute value of difference. */
+		s1 -= s2;
+		if (s1 < 0)
+			s1 = ~s1 + 1;
+
+		rd_val += s1;
+	}
+
+	*rd = rd_val;
+}
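
pdist() is a sum of absolute differences that accumulates into the destination register. A throwaway demonstration of the per-byte arithmetic with made-up input data:

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t sad8(uint64_t a, uint64_t b)
	{
		uint64_t sum = 0;
		int i;

		for (i = 0; i < 8; i++) {
			int x = (a >> (56 - i * 8)) & 0xff;
			int y = (b >> (56 - i * 8)) & 0xff;

			sum += (x > y) ? x - y : y - x;
		}
		return sum;
	}

	int main(void)
	{
		/* every byte differs by 0x10, so the result is 8 * 0x10 = 128 */
		printf("%llu\n", (unsigned long long)
		       sad8(0x1010101010101010ULL, 0x2020202020202020ULL));
		return 0;
	}
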
+
+static void pformat(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, gsr, scale, rd_val;
+
+	gsr = current_thread_info()->gsr[0];
+	scale = (gsr >> 3) & (opf == FPACK16_OPF ? 0xf : 0x1f);
+	switch (opf) {
+	case FPACK16_OPF: {
+		unsigned long byte;
+
+		rs2 = fpd_regval(f, RS2(insn));
+		rd_val = 0;
+		for (byte = 0; byte < 4; byte++) {
+			unsigned int val;
+			s16 src = (rs2 >> (byte * 16UL)) & 0xffffUL;
+			int scaled = src << scale;
+			int from_fixed = scaled >> 7;
+
+			val = ((from_fixed < 0) ?
+			       0 :
+			       (from_fixed > 255) ?
+			       255 : from_fixed);
+
+			rd_val |= (val << (8 * byte));
+		}
+		*fps_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FPACK32_OPF: {
+		unsigned long word;
+
+		rs1 = fpd_regval(f, RS1(insn));
+		rs2 = fpd_regval(f, RS2(insn));
+		rd_val = (rs1 << 8) & ~(0x000000ff000000ffUL);
+		for (word = 0; word < 2; word++) {
+			unsigned long val;
+			s32 src = (rs2 >> (word * 32UL));
+			s64 scaled = src << scale;
+			s64 from_fixed = scaled >> 23;
+
+			val = ((from_fixed < 0) ?
+			       0 :
+			       (from_fixed > 255) ?
+			       255 : from_fixed);
+
+			rd_val |= (val << (32 * word));
+		}
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FPACKFIX_OPF: {
+		unsigned long word;
+
+		rs2 = fpd_regval(f, RS2(insn));
+
+		rd_val = 0;
+		for (word = 0; word < 2; word++) {
+			long val;
+			s32 src = (rs2 >> (word * 32UL));
+			s64 scaled = src << scale;
+			s64 from_fixed = scaled >> 16;
+
+			val = ((from_fixed < -32768) ?
+			       -32768 :
+			       (from_fixed > 32767) ?
+			       32767 : from_fixed);
+
+			rd_val |= ((val & 0xffff) << (word * 16));
+		}
+		*fps_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FEXPAND_OPF: {
+		unsigned long byte;
+
+		rs2 = fps_regval(f, RS2(insn));
+
+		rd_val = 0;
+		for (byte = 0; byte < 4; byte++) {
+			unsigned long val;
+			u8 src = (rs2 >> (byte * 8)) & 0xff;
+
+			val = src << 4;
+
+			rd_val |= (val << (byte * 16));
+		}
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FPMERGE_OPF: {
+		rs1 = fps_regval(f, RS1(insn));
+		rs2 = fps_regval(f, RS2(insn));
+
+		rd_val = (((rs2 & 0x000000ff) <<  0) |
+			  ((rs1 & 0x000000ff) <<  8) |
+			  ((rs2 & 0x0000ff00) <<  8) |
+			  ((rs1 & 0x0000ff00) << 16) |
+			  ((rs2 & 0x00ff0000) << 16) |
+			  ((rs1 & 0x00ff0000) << 24) |
+			  ((rs2 & 0xff000000) << 24) |
+			  ((rs1 & 0xff000000) << 32));
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+	};
+}
+
+static void pmul(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, rd_val;
+
+	switch (opf) {
+	case FMUL8x16_OPF: {
+		unsigned long byte;
+
+		rs1 = fps_regval(f, RS1(insn));
+		rs2 = fpd_regval(f, RS2(insn));
+
+		rd_val = 0;
+		for (byte = 0; byte < 4; byte++) {
+			u16 src1 = (rs1 >> (byte *  8)) & 0x00ff;
+			s16 src2 = (rs2 >> (byte * 16)) & 0xffff;
+			u32 prod = src1 * src2;
+			u16 scaled = ((prod & 0x00ffff00) >> 8);
+
+			/* Round up.  */
+			if (prod & 0x80)
+				scaled++;
+			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
+		}
+
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FMUL8x16AU_OPF:
+	case FMUL8x16AL_OPF: {
+		unsigned long byte;
+		s16 src2;
+
+		rs1 = fps_regval(f, RS1(insn));
+		rs2 = fps_regval(f, RS2(insn));
+
+		rd_val = 0;
+		src2 = rs2 >> ((opf == FMUL8x16AU_OPF) ? 16 : 0);
+		for (byte = 0; byte < 4; byte++) {
+			u16 src1 = (rs1 >> (byte * 8)) & 0x00ff;
+			u32 prod = src1 * src2;
+			u16 scaled = ((prod & 0x00ffff00) >> 8);
+
+			/* Round up.  */
+			if (prod & 0x80)
+				scaled++;
+			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
+		}
+
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FMUL8SUx16_OPF:
+	case FMUL8ULx16_OPF: {
+		unsigned long byte, ushift;
+
+		rs1 = fpd_regval(f, RS1(insn));
+		rs2 = fpd_regval(f, RS2(insn));
+
+		rd_val = 0;
+		ushift = (opf == FMUL8SUx16_OPF) ? 8 : 0;
+		for (byte = 0; byte < 4; byte++) {
+			u16 src1;
+			s16 src2;
+			u32 prod;
+			u16 scaled;
+
+			src1 = ((rs1 >> ((16 * byte) + ushift)) & 0x00ff);
+			src2 = ((rs2 >> (16 * byte)) & 0xffff);
+			prod = src1 * src2;
+			scaled = ((prod & 0x00ffff00) >> 8);
+
+			/* Round up.  */
+			if (prod & 0x80)
+				scaled++;
+			rd_val |= ((scaled & 0xffffUL) << (byte * 16UL));
+		}
+
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+
+	case FMULD8SUx16_OPF:
+	case FMULD8ULx16_OPF: {
+		unsigned long byte, ushift;
+
+		rs1 = fps_regval(f, RS1(insn));
+		rs2 = fps_regval(f, RS2(insn));
+
+		rd_val = 0;
+		ushift = (opf == FMULD8SUx16_OPF) ? 8 : 0;
+		for (byte = 0; byte < 2; byte++) {
+			u16 src1;
+			s16 src2;
+			u32 prod;
+			u16 scaled;
+
+			src1 = ((rs1 >> ((16 * byte) + ushift)) & 0x00ff);
+			src2 = ((rs2 >> (16 * byte)) & 0xffff);
+			prod = src1 * src2;
+			scaled = ((prod & 0x00ffff00) >> 8);
+
+			/* Round up.  */
+			if (prod & 0x80)
+				scaled++;
+			rd_val |= ((scaled & 0xffffUL) <<
+				   ((byte * 32UL) + 7UL));
+		}
+		*fpd_regaddr(f, RD(insn)) = rd_val;
+		break;
+	}
+	};
+}
+
+static void pcmp(struct pt_regs *regs, unsigned int insn, unsigned int opf)
+{
+	struct fpustate *f = FPUSTATE;
+	unsigned long rs1, rs2, rd_val, i;
+
+	rs1 = fpd_regval(f, RS1(insn));
+	rs2 = fpd_regval(f, RS2(insn));
+
+	rd_val = 0;
+
+	switch (opf) {
+	case FCMPGT16_OPF:
+		for (i = 0; i < 4; i++) {
+			s16 a = (rs1 >> (i * 16)) & 0xffff;
+			s16 b = (rs2 >> (i * 16)) & 0xffff;
+
+			if (a > b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPGT32_OPF:
+		for (i = 0; i < 2; i++) {
+			s32 a = (rs1 >> (i * 32)) & 0xffffffff;
+			s32 b = (rs2 >> (i * 32)) & 0xffffffff;
+
+			if (a > b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPLE16_OPF:
+		for (i = 0; i < 4; i++) {
+			s16 a = (rs1 >> (i * 16)) & 0xffff;
+			s16 b = (rs2 >> (i * 16)) & 0xffff;
+
+			if (a <= b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPLE32_OPF:
+		for (i = 0; i < 2; i++) {
+			s32 a = (rs1 >> (i * 32)) & 0xffffffff;
+			s32 b = (rs2 >> (i * 32)) & 0xffffffff;
+
+			if (a <= b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPNE16_OPF:
+		for (i = 0; i < 4; i++) {
+			s16 a = (rs1 >> (i * 16)) & 0xffff;
+			s16 b = (rs2 >> (i * 16)) & 0xffff;
+
+			if (a != b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPNE32_OPF:
+		for (i = 0; i < 2; i++) {
+			s32 a = (rs1 >> (i * 32)) & 0xffffffff;
+			s32 b = (rs2 >> (i * 32)) & 0xffffffff;
+
+			if (a != b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPEQ16_OPF:
+		for (i = 0; i < 4; i++) {
+			s16 a = (rs1 >> (i * 16)) & 0xffff;
+			s16 b = (rs2 >> (i * 16)) & 0xffff;
+
+			if (a == b)
+				rd_val |= 1 << i;
+		}
+		break;
+
+	case FCMPEQ32_OPF:
+		for (i = 0; i < 2; i++) {
+			s32 a = (rs1 >> (i * 32)) & 0xffffffff;
+			s32 b = (rs2 >> (i * 32)) & 0xffffffff;
+
+			if (a == b)
+				rd_val |= 1 << i;
+		}
+		break;
+	};
+
+	maybe_flush_windows(0, 0, RD(insn), 0);
+	store_reg(regs, rd_val, RD(insn));
+}
+
+/* Emulate the VIS instructions which are not implemented in
+ * hardware on Niagara.
+ */
+int vis_emul(struct pt_regs *regs, unsigned int insn)
+{
+	unsigned long pc = regs->tpc;
+	unsigned int opf;
+
+	BUG_ON(regs->tstate & TSTATE_PRIV);
+
+	if (test_thread_flag(TIF_32BIT))
+		pc = (u32)pc;
+
+	if (get_user(insn, (u32 __user *) pc))
+		return -EFAULT;
+
+	if ((insn & VIS_OPCODE_MASK) != VIS_OPCODE_VAL)
+		return -EINVAL;
+
+	opf = (insn & VIS_OPF_MASK) >> VIS_OPF_SHIFT;
+	switch (opf) {
+	default:
+		return -EINVAL;
+
+	/* Pixel Formatting Instructions.  */
+	case FPACK16_OPF:
+	case FPACK32_OPF:
+	case FPACKFIX_OPF:
+	case FEXPAND_OPF:
+	case FPMERGE_OPF:
+		pformat(regs, insn, opf);
+		break;
+
+	/* Partitioned Multiply Instructions  */
+	case FMUL8x16_OPF:
+	case FMUL8x16AU_OPF:
+	case FMUL8x16AL_OPF:
+	case FMUL8SUx16_OPF:
+	case FMUL8ULx16_OPF:
+	case FMULD8SUx16_OPF:
+	case FMULD8ULx16_OPF:
+		pmul(regs, insn, opf);
+		break;
+
+	/* Pixel Compare Instructions  */
+	case FCMPGT16_OPF:
+	case FCMPGT32_OPF:
+	case FCMPLE16_OPF:
+	case FCMPLE32_OPF:
+	case FCMPNE16_OPF:
+	case FCMPNE32_OPF:
+	case FCMPEQ16_OPF:
+	case FCMPEQ32_OPF:
+		pcmp(regs, insn, opf);
+		break;
+
+	/* Edge Handling Instructions  */
+	case EDGE8_OPF:
+	case EDGE8N_OPF:
+	case EDGE8L_OPF:
+	case EDGE8LN_OPF:
+	case EDGE16_OPF:
+	case EDGE16N_OPF:
+	case EDGE16L_OPF:
+	case EDGE16LN_OPF:
+	case EDGE32_OPF:
+	case EDGE32N_OPF:
+	case EDGE32L_OPF:
+	case EDGE32LN_OPF:
+		edge(regs, insn, opf);
+		break;
+
+	/* Pixel Component Distance  */
+	case PDIST_OPF:
+		pdist(regs, insn);
+		break;
+
+	/* Three-Dimensional Array Addressing Instructions  */
+	case ARRAY8_OPF:
+	case ARRAY16_OPF:
+	case ARRAY32_OPF:
+		array(regs, insn, opf);
+		break;
+
+	/* Byte Mask and Shuffle Instructions  */
+	case BMASK_OPF:
+		bmask(regs, insn);
+		break;
+
+	case BSHUFFLE_OPF:
+		bshuffle(regs, insn);
+		break;
+	};
+
+	regs->tpc = regs->tnpc;
+	regs->tnpc += 4;
+	return 0;
+}
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index 467d13a0d5c1..b097379a49a8 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -70,6 +70,22 @@ SECTIONS
   .con_initcall.init : { *(.con_initcall.init) }
   __con_initcall_end = .;
   SECURITY_INIT
+  . = ALIGN(4);
+  __tsb_ldquad_phys_patch = .;
+  .tsb_ldquad_phys_patch : { *(.tsb_ldquad_phys_patch) }
+  __tsb_ldquad_phys_patch_end = .;
+  __tsb_phys_patch = .;
+  .tsb_phys_patch : { *(.tsb_phys_patch) }
+  __tsb_phys_patch_end = .;
+  __cpuid_patch = .;
+  .cpuid_patch : { *(.cpuid_patch) }
+  __cpuid_patch_end = .;
+  __sun4v_1insn_patch = .;
+  .sun4v_1insn_patch : { *(.sun4v_1insn_patch) }
+  __sun4v_1insn_patch_end = .;
+  __sun4v_2insn_patch = .;
+  .sun4v_2insn_patch : { *(.sun4v_2insn_patch) }
+  __sun4v_2insn_patch_end = .;
   . = ALIGN(8192); 
   __initramfs_start = .;
   .init.ramfs : { *(.init.ramfs) }
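
Each of the new sections above collects patch entries emitted next to the code they modify; boot code walks a section between its start and end symbols and rewrites instructions in place. A hedged sketch of such a walk follows; the two-word entry layout and the function name are assumptions for illustration, not taken from this patch. The __sun4v_1insn_patch / __sun4v_1insn_patch_end pair defined above would be the natural arguments for it.

	struct one_insn_patch_entry {
		unsigned int	addr;	/* address of the instruction to patch */
		unsigned int	insn;	/* replacement instruction word */
	};

	static void apply_one_insn_patches(struct one_insn_patch_entry *start,
					   struct one_insn_patch_entry *end)
	{
		for (; start < end; start++) {
			unsigned int *target =
				(unsigned int *)(unsigned long)start->addr;

			*target = start->insn;
			/* A real implementation would also flush the
			 * instruction cache line covering *target. */
		}
	}
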
diff --git a/arch/sparc64/kernel/winfixup.S b/arch/sparc64/kernel/winfixup.S
index 39160926267b..c4aa110a10e5 100644
--- a/arch/sparc64/kernel/winfixup.S
+++ b/arch/sparc64/kernel/winfixup.S
@@ -1,8 +1,6 @@
-/* $Id: winfixup.S,v 1.30 2002/02/09 19:49:30 davem Exp $
+/* winfixup.S: Handle cases where user stack pointer is found to be bogus.
  *
- * winfixup.S: Handle cases where user stack pointer is found to be bogus.
- *
- * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1997, 2006 David S. Miller (davem@davemloft.net)
  */
 
 #include <asm/asi.h>
@@ -15,374 +13,144 @@
 
 	.text
 
-set_pcontext:
-	sethi	%hi(sparc64_kern_pri_context), %l1
-	ldx	[%l1 + %lo(sparc64_kern_pri_context)], %l1
-	mov	PRIMARY_CONTEXT, %g1
-	stxa	%l1, [%g1] ASI_DMMU
-	flush	%g6
-	retl
-	 nop
+	/* It used to be the case that these register window fault
+	 * handlers could run via the save and restore instructions
+	 * done by the trap entry and exit code.  They now do the
+	 * window spill/fill by hand, so that case no longer can occur.
+	 */
 
 	.align	32
-
-	/* Here are the rules, pay attention.
-	 *
-	 * The kernel is disallowed from touching user space while
-	 * the trap level is greater than zero, except for from within
-	 * the window spill/fill handlers.  This must be followed
-	 * so that we can easily detect the case where we tried to
-	 * spill/fill with a bogus (or unmapped) user stack pointer.
-	 *
-	 * These are layed out in a special way for cache reasons,
-	 * don't touch...
-	 */
-	.globl	fill_fixup, spill_fixup
 fill_fixup:
-	rdpr		%tstate, %g1
-	andcc		%g1, TSTATE_PRIV, %g0
-	or		%g4, FAULT_CODE_WINFIXUP, %g4
-	be,pt		%xcc, window_scheisse_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
-
-	/* This is the extremely complex case, but it does happen from
-	 * time to time if things are just right.  Essentially the restore
-	 * done in rtrap right before going back to user mode, with tl=1
-	 * and that levels trap stack registers all setup, took a fill trap,
-	 * the user stack was not mapped in the tlb, and tlb miss occurred,
-	 * the pte found was not valid, and a simple ref bit watch update
-	 * could not satisfy the miss, so we got here.
-	 *
-	 * We must carefully unwind the state so we get back to tl=0, preserve
-	 * all the register values we were going to give to the user.  Luckily
-	 * most things are where they need to be, we also have the address
-	 * which triggered the fault handy as well.
-	 *
-	 * Also note that we must preserve %l5 and %l6.  If the user was
-	 * returning from a system call, we must make it look this way
-	 * after we process the fill fault on the users stack.
-	 *
-	 * First, get into the window where the original restore was executed.
-	 */
-
-	rdpr		%wstate, %g2			! Grab user mode wstate.
-	wrpr		%g1, %cwp			! Get into the right window.
-	sll		%g2, 3, %g2			! NORMAL-->OTHER
-
-	wrpr		%g0, 0x0, %canrestore		! Standard etrap stuff.
-	wrpr		%g2, 0x0, %wstate		! This must be consistent.
-	wrpr		%g0, 0x0, %otherwin		! We know this.
-	call		set_pcontext			! Change contexts...
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	rdpr	%tstate, %g1
+	and	%g1, TSTATE_CWP, %g1
+	or	%g4, FAULT_CODE_WINFIXUP, %g4
+	stb	%g4, [%g6 + TI_FAULT_CODE]
+	stx	%g5, [%g6 + TI_FAULT_ADDR]
+	wrpr	%g1, %cwp
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	call	do_sparc64_fault
+	 add	%sp, PTREGS_OFF, %o0
+	ba,pt	%xcc, rtrap_clr_l6
 	 nop
-	rdpr		%pstate, %l1			! Prepare to change globals.
-	mov		%g6, %o7			! Get current.
-
-	andn		%l1, PSTATE_MM, %l1		! We want to be in RMO
-	stb		%g4, [%g6 + TI_FAULT_CODE]
-	stx		%g5, [%g6 + TI_FAULT_ADDR]
-	wrpr		%g0, 0x0, %tl			! Out of trap levels.
-	wrpr		%l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
-	mov		%o7, %g6
-	ldx		[%g6 + TI_TASK], %g4
-#ifdef CONFIG_SMP
-	mov		TSB_REG, %g1
-	ldxa		[%g1] ASI_IMMU, %g5
-#endif
 
-	/* This is the same as below, except we handle this a bit special
-	 * since we must preserve %l5 and %l6, see comment above.
-	 */
-	call		do_sparc64_fault
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, rtrap
-	 nop						! yes, nop is correct
-
-	/* Be very careful about usage of the alternate globals here.
-	 * You cannot touch %g4/%g5 as that has the fault information
-	 * should this be from usermode.  Also be careful for the case
-	 * where we get here from the save instruction in etrap.S when
-	 * coming from either user or kernel (does not matter which, it
-	 * is the same problem in both cases).  Essentially this means
-	 * do not touch %g7 or %g2 so we handle the two cases fine.
+	/* Be very careful about usage of the trap globals here.
+	 * You cannot touch %g5 as that has the fault information.
 	 */
 spill_fixup:
-	ldx		[%g6 + TI_FLAGS], %g1
-	andcc		%g1, _TIF_32BIT, %g0
-	ldub		[%g6 + TI_WSAVED], %g1
-
-	sll		%g1, 3, %g3
-	add		%g6, %g3, %g3
-	stx		%sp, [%g3 + TI_RWIN_SPTRS]
-	sll		%g1, 7, %g3
-	bne,pt		%xcc, 1f
-	 add		%g6, %g3, %g3
-	stx		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	stx		%l1, [%g3 + TI_REG_WINDOW + 0x08]
-
-	stx		%l2, [%g3 + TI_REG_WINDOW + 0x10]
-	stx		%l3, [%g3 + TI_REG_WINDOW + 0x18]
-	stx		%l4, [%g3 + TI_REG_WINDOW + 0x20]
-	stx		%l5, [%g3 + TI_REG_WINDOW + 0x28]
-	stx		%l6, [%g3 + TI_REG_WINDOW + 0x30]
-	stx		%l7, [%g3 + TI_REG_WINDOW + 0x38]
-	stx		%i0, [%g3 + TI_REG_WINDOW + 0x40]
-	stx		%i1, [%g3 + TI_REG_WINDOW + 0x48]
-
-	stx		%i2, [%g3 + TI_REG_WINDOW + 0x50]
-	stx		%i3, [%g3 + TI_REG_WINDOW + 0x58]
-	stx		%i4, [%g3 + TI_REG_WINDOW + 0x60]
-	stx		%i5, [%g3 + TI_REG_WINDOW + 0x68]
-	stx		%i6, [%g3 + TI_REG_WINDOW + 0x70]
-	b,pt		%xcc, 2f
-	 stx		%i7, [%g3 + TI_REG_WINDOW + 0x78]
-1:	stw		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-
-	stw		%l1, [%g3 + TI_REG_WINDOW + 0x04]
-	stw		%l2, [%g3 + TI_REG_WINDOW + 0x08]
-	stw		%l3, [%g3 + TI_REG_WINDOW + 0x0c]
-	stw		%l4, [%g3 + TI_REG_WINDOW + 0x10]
-	stw		%l5, [%g3 + TI_REG_WINDOW + 0x14]
-	stw		%l6, [%g3 + TI_REG_WINDOW + 0x18]
-	stw		%l7, [%g3 + TI_REG_WINDOW + 0x1c]
-	stw		%i0, [%g3 + TI_REG_WINDOW + 0x20]
-
-	stw		%i1, [%g3 + TI_REG_WINDOW + 0x24]
-	stw		%i2, [%g3 + TI_REG_WINDOW + 0x28]
-	stw		%i3, [%g3 + TI_REG_WINDOW + 0x2c]
-	stw		%i4, [%g3 + TI_REG_WINDOW + 0x30]
-	stw		%i5, [%g3 + TI_REG_WINDOW + 0x34]
-	stw		%i6, [%g3 + TI_REG_WINDOW + 0x38]
-	stw		%i7, [%g3 + TI_REG_WINDOW + 0x3c]
-2:	add		%g1, 1, %g1
-
-	stb		%g1, [%g6 + TI_WSAVED]
-	rdpr		%tstate, %g1
-	andcc		%g1, TSTATE_PRIV, %g0
+spill_fixup_mna:
+spill_fixup_dax:
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	ldx	[%g6 + TI_FLAGS], %g1
+	andcc	%g1, _TIF_32BIT, %g0
+	ldub	[%g6 + TI_WSAVED], %g1
+	sll	%g1, 3, %g3
+	add	%g6, %g3, %g3
+	stx	%sp, [%g3 + TI_RWIN_SPTRS]
+	sll	%g1, 7, %g3
+	bne,pt	%xcc, 1f
+	 add	%g6, %g3, %g3
+	stx	%l0, [%g3 + TI_REG_WINDOW + 0x00]
+	stx	%l1, [%g3 + TI_REG_WINDOW + 0x08]
+	stx	%l2, [%g3 + TI_REG_WINDOW + 0x10]
+	stx	%l3, [%g3 + TI_REG_WINDOW + 0x18]
+	stx	%l4, [%g3 + TI_REG_WINDOW + 0x20]
+	stx	%l5, [%g3 + TI_REG_WINDOW + 0x28]
+	stx	%l6, [%g3 + TI_REG_WINDOW + 0x30]
+	stx	%l7, [%g3 + TI_REG_WINDOW + 0x38]
+	stx	%i0, [%g3 + TI_REG_WINDOW + 0x40]
+	stx	%i1, [%g3 + TI_REG_WINDOW + 0x48]
+	stx	%i2, [%g3 + TI_REG_WINDOW + 0x50]
+	stx	%i3, [%g3 + TI_REG_WINDOW + 0x58]
+	stx	%i4, [%g3 + TI_REG_WINDOW + 0x60]
+	stx	%i5, [%g3 + TI_REG_WINDOW + 0x68]
+	stx	%i6, [%g3 + TI_REG_WINDOW + 0x70]
+	ba,pt	%xcc, 2f
+	 stx	%i7, [%g3 + TI_REG_WINDOW + 0x78]
+1:	stw	%l0, [%g3 + TI_REG_WINDOW + 0x00]
+	stw	%l1, [%g3 + TI_REG_WINDOW + 0x04]
+	stw	%l2, [%g3 + TI_REG_WINDOW + 0x08]
+	stw	%l3, [%g3 + TI_REG_WINDOW + 0x0c]
+	stw	%l4, [%g3 + TI_REG_WINDOW + 0x10]
+	stw	%l5, [%g3 + TI_REG_WINDOW + 0x14]
+	stw	%l6, [%g3 + TI_REG_WINDOW + 0x18]
+	stw	%l7, [%g3 + TI_REG_WINDOW + 0x1c]
+	stw	%i0, [%g3 + TI_REG_WINDOW + 0x20]
+	stw	%i1, [%g3 + TI_REG_WINDOW + 0x24]
+	stw	%i2, [%g3 + TI_REG_WINDOW + 0x28]
+	stw	%i3, [%g3 + TI_REG_WINDOW + 0x2c]
+	stw	%i4, [%g3 + TI_REG_WINDOW + 0x30]
+	stw	%i5, [%g3 + TI_REG_WINDOW + 0x34]
+	stw	%i6, [%g3 + TI_REG_WINDOW + 0x38]
+	stw	%i7, [%g3 + TI_REG_WINDOW + 0x3c]
+2:	add	%g1, 1, %g1
+	stb	%g1, [%g6 + TI_WSAVED]
+	rdpr	%tstate, %g1
+	andcc	%g1, TSTATE_PRIV, %g0
 	saved
-	and		%g1, TSTATE_CWP, %g1
-	be,pn		%xcc, window_scheisse_from_user_common
-	 mov		FAULT_CODE_WRITE | FAULT_CODE_DTLB | FAULT_CODE_WINFIXUP, %g4
+	be,pn	%xcc, 1f
+	 and	%g1, TSTATE_CWP, %g1
 	retry
+1:	mov	FAULT_CODE_WRITE | FAULT_CODE_DTLB | FAULT_CODE_WINFIXUP, %g4
+	stb	%g4, [%g6 + TI_FAULT_CODE]
+	stx	%g5, [%g6 + TI_FAULT_ADDR]
+	wrpr	%g1, %cwp
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	call	do_sparc64_fault
+	 add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap_clr_l6
 
-window_scheisse_from_user_common:
-	stb		%g4, [%g6 + TI_FAULT_CODE]
-	stx		%g5, [%g6 + TI_FAULT_ADDR]
-	wrpr		%g1, %cwp
-	ba,pt		%xcc, etrap
-	 rd		%pc, %g7
-	call		do_sparc64_fault
-	 add		%sp, PTREGS_OFF, %o0
-	ba,a,pt		%xcc, rtrap_clr_l6
-
-	.globl		winfix_mna, fill_fixup_mna, spill_fixup_mna
 winfix_mna:
-	andn		%g3, 0x7f, %g3
-	add		%g3, 0x78, %g3
-	wrpr		%g3, %tnpc
+	andn	%g3, 0x7f, %g3
+	add	%g3, 0x78, %g3
+	wrpr	%g3, %tnpc
 	done
-fill_fixup_mna:
-	rdpr		%tstate, %g1
-	andcc		%g1, TSTATE_PRIV, %g0
-	be,pt		%xcc, window_mna_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
 
-	/* Please, see fill_fixup commentary about why we must preserve
-	 * %l5 and %l6 to preserve absolute correct semantics.
-	 */
-	rdpr		%wstate, %g2			! Grab user mode wstate.
-	wrpr		%g1, %cwp			! Get into the right window.
-	sll		%g2, 3, %g2			! NORMAL-->OTHER
-	wrpr		%g0, 0x0, %canrestore		! Standard etrap stuff.
-
-	wrpr		%g2, 0x0, %wstate		! This must be consistent.
-	wrpr		%g0, 0x0, %otherwin		! We know this.
-	call		set_pcontext			! Change contexts...
+fill_fixup_mna:
+	rdpr	%tstate, %g1
+	and	%g1, TSTATE_CWP, %g1
+	wrpr	%g1, %cwp
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	sethi	%hi(tlb_type), %g1
+	lduw	[%g1 + %lo(tlb_type)], %g1
+	cmp	%g1, 3
+	bne,pt	%icc, 1f
+	 add	%sp, PTREGS_OFF, %o0
+	mov	%l4, %o2
+	call	sun4v_do_mna
+	 mov	%l5, %o1
+	ba,a,pt	%xcc, rtrap_clr_l6
+1:	mov	%l4, %o1
+	mov	%l5, %o2
+	call	mem_address_unaligned
 	 nop
-	rdpr		%pstate, %l1			! Prepare to change globals.
-	mov		%g4, %o2			! Setup args for
-	mov		%g5, %o1			! final call to mem_address_unaligned.
-	andn		%l1, PSTATE_MM, %l1		! We want to be in RMO
+	ba,a,pt	%xcc, rtrap_clr_l6
 
-	mov		%g6, %o7			! Stash away current.
-	wrpr		%g0, 0x0, %tl			! Out of trap levels.
-	wrpr		%l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
-	mov		%o7, %g6			! Get current back.
-	ldx		[%g6 + TI_TASK], %g4		! Finish it.
-#ifdef CONFIG_SMP
-	mov		TSB_REG, %g1
-	ldxa		[%g1] ASI_IMMU, %g5
-#endif
-	call		mem_address_unaligned
-	 add		%sp, PTREGS_OFF, %o0
-
-	b,pt		%xcc, rtrap
-	 nop						! yes, the nop is correct
-spill_fixup_mna:
-	ldx		[%g6 + TI_FLAGS], %g1
-	andcc		%g1, _TIF_32BIT, %g0
-	ldub		[%g6 + TI_WSAVED], %g1
-	sll		%g1, 3, %g3
-	add		%g6, %g3, %g3
-	stx		%sp, [%g3 + TI_RWIN_SPTRS]
-
-	sll		%g1, 7, %g3
-	bne,pt		%xcc, 1f
-	 add		%g6, %g3, %g3
-	stx		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	stx		%l1, [%g3 + TI_REG_WINDOW + 0x08]
-	stx		%l2, [%g3 + TI_REG_WINDOW + 0x10]
-	stx		%l3, [%g3 + TI_REG_WINDOW + 0x18]
-	stx		%l4, [%g3 + TI_REG_WINDOW + 0x20]
-
-	stx		%l5, [%g3 + TI_REG_WINDOW + 0x28]
-	stx		%l6, [%g3 + TI_REG_WINDOW + 0x30]
-	stx		%l7, [%g3 + TI_REG_WINDOW + 0x38]
-	stx		%i0, [%g3 + TI_REG_WINDOW + 0x40]
-	stx		%i1, [%g3 + TI_REG_WINDOW + 0x48]
-	stx		%i2, [%g3 + TI_REG_WINDOW + 0x50]
-	stx		%i3, [%g3 + TI_REG_WINDOW + 0x58]
-	stx		%i4, [%g3 + TI_REG_WINDOW + 0x60]
-
-	stx		%i5, [%g3 + TI_REG_WINDOW + 0x68]
-	stx		%i6, [%g3 + TI_REG_WINDOW + 0x70]
-	stx		%i7, [%g3 + TI_REG_WINDOW + 0x78]
-	b,pt		%xcc, 2f
-	 add		%g1, 1, %g1
-1:	std		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	std		%l2, [%g3 + TI_REG_WINDOW + 0x08]
-	std		%l4, [%g3 + TI_REG_WINDOW + 0x10]
-
-	std		%l6, [%g3 + TI_REG_WINDOW + 0x18]
-	std		%i0, [%g3 + TI_REG_WINDOW + 0x20]
-	std		%i2, [%g3 + TI_REG_WINDOW + 0x28]
-	std		%i4, [%g3 + TI_REG_WINDOW + 0x30]
-	std		%i6, [%g3 + TI_REG_WINDOW + 0x38]
-	add		%g1, 1, %g1
-2:	stb		%g1, [%g6 + TI_WSAVED]
-	rdpr		%tstate, %g1
-
-	andcc		%g1, TSTATE_PRIV, %g0
-	saved
-	be,pn		%xcc, window_mna_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
-	retry
-window_mna_from_user_common:
-	wrpr		%g1, %cwp
-	sethi		%hi(109f), %g7
-	ba,pt		%xcc, etrap
-109:	 or		%g7, %lo(109b), %g7
-	mov		%l4, %o2
-	mov		%l5, %o1
-	call		mem_address_unaligned
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, rtrap
-	 clr		%l6
-	
-	/* These are only needed for 64-bit mode processes which
-	 * put their stack pointer into the VPTE area and there
-	 * happens to be a VPTE tlb entry mapped there during
-	 * a spill/fill trap to that stack frame.
-	 */
-	.globl		winfix_dax, fill_fixup_dax, spill_fixup_dax
 winfix_dax:
-	andn		%g3, 0x7f, %g3
-	add		%g3, 0x74, %g3
-	wrpr		%g3, %tnpc
+	andn	%g3, 0x7f, %g3
+	add	%g3, 0x74, %g3
+	wrpr	%g3, %tnpc
 	done
-fill_fixup_dax:
-	rdpr		%tstate, %g1
-	andcc		%g1, TSTATE_PRIV, %g0
-	be,pt		%xcc, window_dax_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
-
-	/* Please, see fill_fixup commentary about why we must preserve
-	 * %l5 and %l6 to preserve absolute correct semantics.
-	 */
-	rdpr		%wstate, %g2			! Grab user mode wstate.
-	wrpr		%g1, %cwp			! Get into the right window.
-	sll		%g2, 3, %g2			! NORMAL-->OTHER
-	wrpr		%g0, 0x0, %canrestore		! Standard etrap stuff.
 
-	wrpr		%g2, 0x0, %wstate		! This must be consistent.
-	wrpr		%g0, 0x0, %otherwin		! We know this.
-	call		set_pcontext			! Change contexts...
+fill_fixup_dax:
+	rdpr	%tstate, %g1
+	and	%g1, TSTATE_CWP, %g1
+	wrpr	%g1, %cwp
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	sethi	%hi(tlb_type), %g1
+	mov	%l4, %o1
+	lduw	[%g1 + %lo(tlb_type)], %g1
+	mov	%l5, %o2
+	cmp	%g1, 3
+	bne,pt	%icc, 1f
+	 add	%sp, PTREGS_OFF, %o0
+	call	sun4v_data_access_exception
 	 nop
-	rdpr		%pstate, %l1			! Prepare to change globals.
-	mov		%g4, %o1			! Setup args for
-	mov		%g5, %o2			! final call to spitfire_data_access_exception.
-	andn		%l1, PSTATE_MM, %l1		! We want to be in RMO
-
-	mov		%g6, %o7			! Stash away current.
-	wrpr		%g0, 0x0, %tl			! Out of trap levels.
-	wrpr		%l1, (PSTATE_IE | PSTATE_AG | PSTATE_RMO), %pstate
-	mov		%o7, %g6			! Get current back.
-	ldx		[%g6 + TI_TASK], %g4		! Finish it.
-#ifdef CONFIG_SMP
-	mov		TSB_REG, %g1
-	ldxa		[%g1] ASI_IMMU, %g5
-#endif
-	call		spitfire_data_access_exception
-	 add		%sp, PTREGS_OFF, %o0
-
-	b,pt		%xcc, rtrap
-	 nop						! yes, the nop is correct
-spill_fixup_dax:
-	ldx		[%g6 + TI_FLAGS], %g1
-	andcc		%g1, _TIF_32BIT, %g0
-	ldub		[%g6 + TI_WSAVED], %g1
-	sll		%g1, 3, %g3
-	add		%g6, %g3, %g3
-	stx		%sp, [%g3 + TI_RWIN_SPTRS]
-
-	sll		%g1, 7, %g3
-	bne,pt		%xcc, 1f
-	 add		%g6, %g3, %g3
-	stx		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	stx		%l1, [%g3 + TI_REG_WINDOW + 0x08]
-	stx		%l2, [%g3 + TI_REG_WINDOW + 0x10]
-	stx		%l3, [%g3 + TI_REG_WINDOW + 0x18]
-	stx		%l4, [%g3 + TI_REG_WINDOW + 0x20]
-
-	stx		%l5, [%g3 + TI_REG_WINDOW + 0x28]
-	stx		%l6, [%g3 + TI_REG_WINDOW + 0x30]
-	stx		%l7, [%g3 + TI_REG_WINDOW + 0x38]
-	stx		%i0, [%g3 + TI_REG_WINDOW + 0x40]
-	stx		%i1, [%g3 + TI_REG_WINDOW + 0x48]
-	stx		%i2, [%g3 + TI_REG_WINDOW + 0x50]
-	stx		%i3, [%g3 + TI_REG_WINDOW + 0x58]
-	stx		%i4, [%g3 + TI_REG_WINDOW + 0x60]
-
-	stx		%i5, [%g3 + TI_REG_WINDOW + 0x68]
-	stx		%i6, [%g3 + TI_REG_WINDOW + 0x70]
-	stx		%i7, [%g3 + TI_REG_WINDOW + 0x78]
-	b,pt		%xcc, 2f
-	 add		%g1, 1, %g1
-1:	std		%l0, [%g3 + TI_REG_WINDOW + 0x00]
-	std		%l2, [%g3 + TI_REG_WINDOW + 0x08]
-	std		%l4, [%g3 + TI_REG_WINDOW + 0x10]
-
-	std		%l6, [%g3 + TI_REG_WINDOW + 0x18]
-	std		%i0, [%g3 + TI_REG_WINDOW + 0x20]
-	std		%i2, [%g3 + TI_REG_WINDOW + 0x28]
-	std		%i4, [%g3 + TI_REG_WINDOW + 0x30]
-	std		%i6, [%g3 + TI_REG_WINDOW + 0x38]
-	add		%g1, 1, %g1
-2:	stb		%g1, [%g6 + TI_WSAVED]
-	rdpr		%tstate, %g1
-
-	andcc		%g1, TSTATE_PRIV, %g0
-	saved
-	be,pn		%xcc, window_dax_from_user_common
-	 and		%g1, TSTATE_CWP, %g1
-	retry
-window_dax_from_user_common:
-	wrpr		%g1, %cwp
-	sethi		%hi(109f), %g7
-	ba,pt		%xcc, etrap
-109:	 or		%g7, %lo(109b), %g7
-	mov		%l4, %o1
-	mov		%l5, %o2
-	call		spitfire_data_access_exception
-	 add		%sp, PTREGS_OFF, %o0
-	ba,pt		%xcc, rtrap
-	 clr		%l6
+	ba,a,pt	%xcc, rtrap_clr_l6
+1:	call	spitfire_data_access_exception
+	 nop
+	ba,a,pt	%xcc, rtrap_clr_l6
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
index c295806500f7..4a725d8985f1 100644
--- a/arch/sparc64/lib/Makefile
+++ b/arch/sparc64/lib/Makefile
@@ -11,7 +11,9 @@ lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
 	 VISsave.o atomic.o bitops.o \
 	 U1memcpy.o U1copy_from_user.o U1copy_to_user.o \
 	 U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \
+	 NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o NGpatch.o \
+	 NGpage.o NGbzero.o \
 	 copy_in_user.o user_fixup.o memmove.o \
-	 mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o
+	 mcount.o ipcsum.o rwsem.o xor.o delay.o
 
 obj-y += iomap.o
diff --git a/arch/sparc64/lib/NGbzero.S b/arch/sparc64/lib/NGbzero.S
new file mode 100644
index 000000000000..e86baece5cc8
--- /dev/null
+++ b/arch/sparc64/lib/NGbzero.S
@@ -0,0 +1,163 @@
+/* NGbzero.S: Niagara optimized memset/clear_user.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+#include <asm/asi.h>
+
+#define EX_ST(x,y)		\
+98:	x,y;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	retl;			\
+	 mov	%o1, %o0;	\
+	.section __ex_table;	\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+	.text
+
+	.globl		NGmemset
+	.type		NGmemset, #function
+NGmemset:		/* %o0=buf, %o1=pat, %o2=len */
+	and		%o1, 0xff, %o3
+	mov		%o2, %o1
+	sllx		%o3, 8, %g1
+	or		%g1, %o3, %o2
+	sllx		%o2, 16, %g1
+	or		%g1, %o2, %o2
+	sllx		%o2, 32, %g1
+	ba,pt		%xcc, 1f
+	 or		%g1, %o2, %o2
+
+	.globl		NGbzero
+	.type		NGbzero, #function
+NGbzero:
+	clr		%o2
+1:	brz,pn		%o1, NGbzero_return
+	 mov		%o0, %o3
+
+	/* %o5: saved %asi, restored at NGbzero_done
+	 * %g7: store-init %asi to use
+	 * %o4:	non-store-init %asi to use
+	 */
+	rd		%asi, %o5
+	mov		ASI_BLK_INIT_QUAD_LDD_P, %g7
+	mov		ASI_P, %o4
+	wr		%o4, 0x0, %asi
+
+NGbzero_from_clear_user:
+	cmp		%o1, 15
+	bl,pn		%icc, NGbzero_tiny
+	 andcc		%o0, 0x7, %g1
+	be,pt		%xcc, 2f
+	 mov		8, %g2
+	sub		%g2, %g1, %g1
+	sub		%o1, %g1, %o1
+1:	EX_ST(stba %o2, [%o0 + 0x00] %asi)
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%o0, 1, %o0
+2:	cmp		%o1, 128
+	bl,pn		%icc, NGbzero_medium
+	 andcc		%o0, (64 - 1), %g1
+	be,pt		%xcc, NGbzero_pre_loop
+	 mov		64, %g2
+	sub		%g2, %g1, %g1
+	sub		%o1, %g1, %o1
+1:	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	subcc		%g1, 8, %g1
+	bne,pt		%xcc, 1b
+	 add		%o0, 8, %o0
+
+NGbzero_pre_loop:
+	wr		%g7, 0x0, %asi
+	andn		%o1, (64 - 1), %g1
+	sub		%o1, %g1, %o1
+NGbzero_loop:
+	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x08] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x10] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x18] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x20] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x28] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x30] %asi)
+	EX_ST(stxa %o2, [%o0 + 0x38] %asi)
+	subcc		%g1, 64, %g1
+	bne,pt		%xcc, NGbzero_loop
+	 add		%o0, 64, %o0
+
+	wr		%o4, 0x0, %asi
+	brz,pn		%o1, NGbzero_done
+NGbzero_medium:
+	 andncc		%o1, 0x7, %g1
+	be,pn		%xcc, 2f
+	 sub		%o1, %g1, %o1
+1:	EX_ST(stxa %o2, [%o0 + 0x00] %asi)
+	subcc		%g1, 8, %g1
+	bne,pt		%xcc, 1b
+	 add		%o0, 8, %o0
+2:	brz,pt		%o1, NGbzero_done
+	 nop
+
+NGbzero_tiny:
+1:	EX_ST(stba %o2, [%o0 + 0x00] %asi)
+	subcc		%o1, 1, %o1
+	bne,pt		%icc, 1b
+	 add		%o0, 1, %o0
+
+	/* fallthrough */
+
+NGbzero_done:
+	wr		%o5, 0x0, %asi
+
+NGbzero_return:
+	retl
+	 mov		%o3, %o0
+	.size		NGbzero, .-NGbzero
+	.size		NGmemset, .-NGmemset
+
+	.globl		NGclear_user
+	.type		NGclear_user, #function
+NGclear_user:		/* %o0=buf, %o1=len */
+	rd		%asi, %o5
+	brz,pn		%o1, NGbzero_done
+	 clr		%o3
+	cmp		%o5, ASI_AIUS
+	bne,pn		%icc, NGbzero
+	 clr		%o2
+	mov		ASI_BLK_INIT_QUAD_LDD_AIUS, %g7
+	ba,pt		%xcc, NGbzero_from_clear_user
+	 mov		ASI_AIUS, %o4
+	.size		NGclear_user, .-NGclear_user
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	niagara_patch_bzero
+	.type	niagara_patch_bzero,#function
+niagara_patch_bzero:
+	NG_DO_PATCH(memset, NGmemset)
+	NG_DO_PATCH(__bzero, NGbzero)
+	NG_DO_PATCH(__clear_user, NGclear_user)
+	NG_DO_PATCH(tsb_init, NGtsb_init)
+	retl
+	 nop
+	.size	niagara_patch_bzero,.-niagara_patch_bzero
diff --git a/arch/sparc64/lib/NGcopy_from_user.S b/arch/sparc64/lib/NGcopy_from_user.S
new file mode 100644
index 000000000000..2d93456f76dd
--- /dev/null
+++ b/arch/sparc64/lib/NGcopy_from_user.S
@@ -0,0 +1,37 @@
+/* NGcopy_from_user.S: Niagara optimized copy from userspace.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_LD(x)		\
+98:	x;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	wr	%g0, ASI_AIUS, %asi;\
+	retl;			\
+	 mov	1, %o0;		\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#define FUNC_NAME		NGcopy_from_user
+#define LOAD(type,addr,dest)	type##a [addr] ASI_AIUS, dest
+#define LOAD_TWIN(addr_reg,dest0,dest1)	\
+	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_AIUS, dest0
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
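+	/* If the saved %asi is not ASI_AIUS the caller is running with
+	 * KERNEL_DS, so branch to memcpy_user_stub rather than going
+	 * through the user-space ASIs.
+	 */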
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, memcpy_user_stub;		\
+	 nop
+#endif
+
+#include "NGmemcpy.S"
diff --git a/arch/sparc64/lib/NGcopy_to_user.S b/arch/sparc64/lib/NGcopy_to_user.S
new file mode 100644
index 000000000000..34112d5054ef
--- /dev/null
+++ b/arch/sparc64/lib/NGcopy_to_user.S
@@ -0,0 +1,40 @@
+/* NGcopy_to_user.S: Niagara optimized copy to userspace.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#define EX_ST(x)		\
+98:	x;			\
+	.section .fixup;	\
+	.align 4;		\
+99:	wr	%g0, ASI_AIUS, %asi;\
+	retl;			\
+	 mov	1, %o0;		\
+	.section __ex_table,"a";\
+	.align 4;		\
+	.word 98b, 99b;		\
+	.text;			\
+	.align 4;
+
+#ifndef ASI_AIUS
+#define ASI_AIUS	0x11
+#endif
+
+#define FUNC_NAME		NGcopy_to_user
+#define STORE(type,src,addr)	type##a src, [addr] ASI_AIUS
+#define STORE_ASI		ASI_BLK_INIT_QUAD_LDD_AIUS
+#define EX_RETVAL(x)		0
+
+#ifdef __KERNEL__
+	/* Writing to %asi is _expensive_ so we hardcode it.
+	 * Reading %asi to check for KERNEL_DS is comparatively
+	 * cheap.
+	 */
+#define PREAMBLE					\
+	rd		%asi, %g1;			\
+	cmp		%g1, ASI_AIUS;			\
+	bne,pn		%icc, memcpy_user_stub;		\
+	 nop
+#endif
+
+#include "NGmemcpy.S"
diff --git a/arch/sparc64/lib/NGmemcpy.S b/arch/sparc64/lib/NGmemcpy.S
new file mode 100644
index 000000000000..8e522b3dc095
--- /dev/null
+++ b/arch/sparc64/lib/NGmemcpy.S
@@ -0,0 +1,368 @@
+/* NGmemcpy.S: Niagara optimized memcpy.
+ *
+ * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
+ */
+
+#ifdef __KERNEL__
+#include <asm/asi.h>
+#include <asm/thread_info.h>
+#define GLOBAL_SPARE	%g7
+#define RESTORE_ASI(TMP)	\
+	ldub	[%g6 + TI_CURRENT_DS], TMP;  \
+	wr	TMP, 0x0, %asi;
+#else
+#define GLOBAL_SPARE	%g5
+#define RESTORE_ASI(TMP)	\
+	wr	%g0, ASI_PNF, %asi
+#endif
+
+#ifndef STORE_ASI
+#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
+#endif
+
+#ifndef EX_LD
+#define EX_LD(x)	x
+#endif
+
+#ifndef EX_ST
+#define EX_ST(x)	x
+#endif
+
+#ifndef EX_RETVAL
+#define EX_RETVAL(x)	x
+#endif
+
+#ifndef LOAD
+#ifndef MEMCPY_DEBUG
+#define LOAD(type,addr,dest)	type [addr], dest
+#else
+#define LOAD(type,addr,dest)	type##a [addr] 0x80, dest
+#endif
+#endif
+
+#ifndef LOAD_TWIN
+#define LOAD_TWIN(addr_reg,dest0,dest1)	\
+	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
+#endif
+
+#ifndef STORE
+#define STORE(type,src,addr)	type src, [addr]
+#endif
+
+#ifndef STORE_INIT
+#define STORE_INIT(src,addr)	stxa src, [addr] %asi
+#endif
+
+#ifndef FUNC_NAME
+#define FUNC_NAME	NGmemcpy
+#endif
+
+#ifndef PREAMBLE
+#define PREAMBLE
+#endif
+
+#ifndef XCC
+#define XCC xcc
+#endif
+
+	.register	%g2,#scratch
+	.register	%g3,#scratch
+
+	.text
+	.align		64
+
+	.globl	FUNC_NAME
+	.type	FUNC_NAME,#function
+FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
+	srlx		%o2, 31, %g2
+	cmp		%g2, 0
+	tne		%xcc, 5
+	PREAMBLE
+	mov		%o0, GLOBAL_SPARE
+	cmp		%o2, 0
+	be,pn		%XCC, 85f
+	 or		%o0, %o1, %o3
+	cmp		%o2, 16
+	blu,a,pn	%XCC, 80f
+	 or		%o3, %o2, %o3
+
+	/* 2 blocks (128 bytes) is the minimum we can do the block
+	 * copy with.  We need to ensure that we'll iterate at least
+	 * once in the block copy loop.  At worst we'll need to align
+	 * the destination to a 64-byte boundary which can chew up
+	 * to (64 - 1) bytes from the length before we perform the
+	 * block copy loop.
+	 */
+	cmp		%o2, (2 * 64)
+	blu,pt		%XCC, 70f
+	 andcc		%o3, 0x7, %g0
+
+	/* %o0:	dst
+	 * %o1:	src
+	 * %o2:	len  (known to be >= 128)
+	 *
+	 * The block copy loops will use %o4/%o5,%g2/%g3 as
+	 * temporaries while copying the data.
+	 */
+
+	LOAD(prefetch, %o1, #one_read)
+	wr		%g0, STORE_ASI, %asi
+
+	/* Align destination on 64-byte boundary.  */
+	andcc		%o0, (64 - 1), %o4
+	be,pt		%XCC, 2f
+	 sub		%o4, 64, %o4
+	sub		%g0, %o4, %o4	! bytes to align dst
+	sub		%o2, %o4, %o2
+1:	subcc		%o4, 1, %o4
+	EX_LD(LOAD(ldub, %o1, %g1))
+	EX_ST(STORE(stb, %g1, %o0))
+	add		%o1, 1, %o1
+	bne,pt		%XCC, 1b
+	add		%o0, 1, %o0
+
+	/* If the source is on a 16-byte boundary we can do
+	 * the direct block copy loop.  If it is 8-byte aligned
+	 * we can do the 16-byte loads offset by -8 bytes and the
+	 * init stores offset by one register.
+	 *
+	 * If the source is not even 8-byte aligned, we need to do
+	 * shifting and masking (basically integer faligndata).
+	 *
+	 * The careful bit with init stores is that if we store
+	 * to any part of the cache line we have to store the whole
+	 * cacheline else we can end up with corrupt L2 cache line
+	 * contents.  Since the loop works on 64-bytes of 64-byte
+	 * aligned store data at a time, this is easy to ensure.
+	 */
+2:
+	andcc		%o1, (16 - 1), %o4
+	andn		%o2, (64 - 1), %g1	! block copy loop iterator
+	sub		%o2, %g1, %o2		! final sub-block copy bytes
+	be,pt		%XCC, 50f
+	 cmp		%o4, 8
+	be,a,pt		%XCC, 10f
+	 sub		%o1, 0x8, %o1
+
+	/* Neither 8-byte nor 16-byte aligned, shift and mask.  */
+	mov		%g1, %o4
+	and		%o1, 0x7, %g1
+	sll		%g1, 3, %g1
+	mov		64, %o3
+	andn		%o1, 0x7, %o1
+	EX_LD(LOAD(ldx, %o1, %g2))
+	sub		%o3, %g1, %o3
+	sllx		%g2, %g1, %g2
+
+#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
+	EX_LD(LOAD(ldx, SRC, TMP1)); \
+	srlx		TMP1, PRE_SHIFT, TMP2; \
+	or		TMP2, PRE_VAL, TMP2; \
+	EX_ST(STORE_INIT(TMP2, DST)); \
+	sllx		TMP1, POST_SHIFT, PRE_VAL;
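+
+	/* Each SWIVEL_ONE_DWORD step is one unit of the "integer
+	 * faligndata": load the next source doubleword, merge its leading
+	 * bytes with the bytes carried over in PRE_VAL, emit the merged
+	 * doubleword with an init store, and shift the remainder into
+	 * PRE_VAL for the next step.
+	 */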
+
+1:	add		%o1, 0x8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
+	add		%o1, 0x8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
+	add		%o1, 0x8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
+	add		%o1, 0x8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
+	add		%o1, 32, %o1
+	LOAD(prefetch, %o1, #one_read)
+	sub		%o1, 32 - 8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
+	add		%o1, 8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
+	add		%o1, 8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
+	add		%o1, 8, %o1
+	SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
+	subcc		%o4, 64, %o4
+	bne,pt		%XCC, 1b
+	 add		%o0, 64, %o0
+
+#undef SWIVEL_ONE_DWORD
+
+	srl		%g1, 3, %g1
+	ba,pt		%XCC, 60f
+	 add		%o1, %g1, %o1
+
+10:	/* Destination is 64-byte aligned, source was only 8-byte
+	 * aligned but it has been subtracted by 8 and we perform
+	 * one twin load ahead, then add 8 back into source when
+	 * we finish the loop.
+	 */
+	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+1:	add		%o1, 16, %o1
+	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
+	add		%o1, 16 + 32, %o1
+	LOAD(prefetch, %o1, #one_read)
+	sub		%o1, 32, %o1
+	EX_ST(STORE_INIT(%o5, %o0 + 0x00))	! initializes cache line
+	EX_ST(STORE_INIT(%g2, %o0 + 0x08))
+	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+	add		%o1, 16, %o1
+	EX_ST(STORE_INIT(%g3, %o0 + 0x10))
+	EX_ST(STORE_INIT(%o4, %o0 + 0x18))
+	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
+	add		%o1, 16, %o1
+	EX_ST(STORE_INIT(%o5, %o0 + 0x20))
+	EX_ST(STORE_INIT(%g2, %o0 + 0x28))
+	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x30))
+	EX_ST(STORE_INIT(%o4, %o0 + 0x38))
+	subcc		%g1, 64, %g1
+	bne,pt		%XCC, 1b
+	 add		%o0, 64, %o0
+
+	ba,pt		%XCC, 60f
+	 add		%o1, 0x8, %o1
+
+50:	/* Destination is 64-byte aligned, and source is 16-byte
+	 * aligned.
+	 */
+1:	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+	add	%o1, 16, %o1
+	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
+	add	%o1, 16 + 32, %o1
+	LOAD(prefetch, %o1, #one_read)
+	sub	%o1, 32, %o1
+	EX_ST(STORE_INIT(%o4, %o0 + 0x00))	! initializes cache line
+	EX_ST(STORE_INIT(%o5, %o0 + 0x08))
+	EX_LD(LOAD_TWIN(%o1, %o4, %o5))
+	add	%o1, 16, %o1
+	EX_ST(STORE_INIT(%g2, %o0 + 0x10))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x18))
+	EX_LD(LOAD_TWIN(%o1, %g2, %g3))
+	add	%o1, 16, %o1
+	EX_ST(STORE_INIT(%o4, %o0 + 0x20))
+	EX_ST(STORE_INIT(%o5, %o0 + 0x28))
+	EX_ST(STORE_INIT(%g2, %o0 + 0x30))
+	EX_ST(STORE_INIT(%g3, %o0 + 0x38))
+	subcc	%g1, 64, %g1
+	bne,pt	%XCC, 1b
+	 add	%o0, 64, %o0
+	/* fall through */
+
+60:	
+	/* %o2 contains any final bytes still needed to be copied
+	 * over. If anything is left, we copy it one byte at a time.
+	 */
+	RESTORE_ASI(%o3)
+	brz,pt		%o2, 85f
+	 sub		%o0, %o1, %o3
+	ba,a,pt		%XCC, 90f
+
+	.align		64
+70: /* 16 < len <= 64 */
+	bne,pn		%XCC, 75f
+	 sub		%o0, %o1, %o3
+
+72:
+	andn		%o2, 0xf, %o4
+	and		%o2, 0xf, %o2
+1:	subcc		%o4, 0x10, %o4
+	EX_LD(LOAD(ldx, %o1, %o5))
+	add		%o1, 0x08, %o1
+	EX_LD(LOAD(ldx, %o1, %g1))
+	sub		%o1, 0x08, %o1
+	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	add		%o1, 0x8, %o1
+	EX_ST(STORE(stx, %g1, %o1 + %o3))
+	bgu,pt		%XCC, 1b
+	 add		%o1, 0x8, %o1
+73:	andcc		%o2, 0x8, %g0
+	be,pt		%XCC, 1f
+	 nop
+	sub		%o2, 0x8, %o2
+	EX_LD(LOAD(ldx, %o1, %o5))
+	EX_ST(STORE(stx, %o5, %o1 + %o3))
+	add		%o1, 0x8, %o1
+1:	andcc		%o2, 0x4, %g0
+	be,pt		%XCC, 1f
+	 nop
+	sub		%o2, 0x4, %o2
+	EX_LD(LOAD(lduw, %o1, %o5))
+	EX_ST(STORE(stw, %o5, %o1 + %o3))
+	add		%o1, 0x4, %o1
+1:	cmp		%o2, 0
+	be,pt		%XCC, 85f
+	 nop
+	ba,pt		%xcc, 90f
+	 nop
+
+75:
+	andcc		%o0, 0x7, %g1
+	sub		%g1, 0x8, %g1
+	be,pn		%icc, 2f
+	 sub		%g0, %g1, %g1
+	sub		%o2, %g1, %o2
+
+1:	subcc		%g1, 1, %g1
+	EX_LD(LOAD(ldub, %o1, %o5))
+	EX_ST(STORE(stb, %o5, %o1 + %o3))
+	bgu,pt		%icc, 1b
+	 add		%o1, 1, %o1
+
+2:	add		%o1, %o3, %o0
+	andcc		%o1, 0x7, %g1
+	bne,pt		%icc, 8f
+	 sll		%g1, 3, %g1
+
+	cmp		%o2, 16
+	bgeu,pt		%icc, 72b
+	 nop
+	ba,a,pt		%xcc, 73b
+
+8:	mov		64, %o3
+	andn		%o1, 0x7, %o1
+	EX_LD(LOAD(ldx, %o1, %g2))
+	sub		%o3, %g1, %o3
+	andn		%o2, 0x7, %o4
+	sllx		%g2, %g1, %g2
+1:	add		%o1, 0x8, %o1
+	EX_LD(LOAD(ldx, %o1, %g3))
+	subcc		%o4, 0x8, %o4
+	srlx		%g3, %o3, %o5
+	or		%o5, %g2, %o5
+	EX_ST(STORE(stx, %o5, %o0))
+	add		%o0, 0x8, %o0
+	bgu,pt		%icc, 1b
+	 sllx		%g3, %g1, %g2
+
+	srl		%g1, 3, %g1
+	andcc		%o2, 0x7, %o2
+	be,pn		%icc, 85f
+	 add		%o1, %g1, %o1
+	ba,pt		%xcc, 90f
+	 sub		%o0, %o1, %o3
+
+	.align		64
+80: /* 0 < len <= 16 */
+	andcc		%o3, 0x3, %g0
+	bne,pn		%XCC, 90f
+	 sub		%o0, %o1, %o3
+
+1:
+	subcc		%o2, 4, %o2
+	EX_LD(LOAD(lduw, %o1, %g1))
+	EX_ST(STORE(stw, %g1, %o1 + %o3))
+	bgu,pt		%XCC, 1b
+	 add		%o1, 4, %o1
+
+85:	retl
+	 mov		EX_RETVAL(GLOBAL_SPARE), %o0
+
+	.align		32
+90:
+	subcc		%o2, 1, %o2
+	EX_LD(LOAD(ldub, %o1, %g1))
+	EX_ST(STORE(stb, %g1, %o1 + %o3))
+	bgu,pt		%XCC, 90b
+	 add		%o1, 1, %o1
+	retl
+	 mov		EX_RETVAL(GLOBAL_SPARE), %o0
+
+	.size		FUNC_NAME, .-FUNC_NAME
diff --git a/arch/sparc64/lib/NGpage.S b/arch/sparc64/lib/NGpage.S
new file mode 100644
index 000000000000..7d7c3bb8dcbf
--- /dev/null
+++ b/arch/sparc64/lib/NGpage.S
@@ -0,0 +1,96 @@
+/* NGpage.S: Niagara optimized clear and copy page.
+ *
+ * Copyright (C) 2006 (davem@davemloft.net)
+ */
+
+#include <asm/asi.h>
+#include <asm/page.h>
+
+	.text
+	.align	32
+
+	/* This is heavily simplified from the sun4u variants
+	 * because Niagara does not have any D-cache aliasing issues
+	 * and also we don't need to use the FPU in order to implement
+	 * an optimal page copy/clear.
+	 */
+
+NGcopy_user_page:	/* %o0=dest, %o1=src, %o2=vaddr */
+	prefetch	[%o1 + 0x00], #one_read
+	mov		8, %g1
+	mov		16, %g2
+	mov		24, %g3
+	set		PAGE_SIZE, %g7
+
+1:	ldda		[%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2
+	ldda		[%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4
+	prefetch	[%o1 + 0x40], #one_read
+	add		%o1, 32, %o1
+	stxa		%o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
+	ldda		[%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2
+	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	ldda		[%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4
+	add		%o1, 32, %o1
+	add		%o0, 32, %o0
+	stxa		%o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	subcc		%g7, 64, %g7
+	bne,pt		%xcc, 1b
+	 add		%o0, 32, %o0
+	retl
+	 nop
+
+NGclear_page:		/* %o0=dest */
+NGclear_user_page:	/* %o0=dest, %o1=vaddr */
+	mov		8, %g1
+	mov		16, %g2
+	mov		24, %g3
+	set		PAGE_SIZE, %g7
+
+1:	stxa		%g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	add		%o0, 32, %o0
+	stxa		%g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
+	stxa		%g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
+	subcc		%g7, 64, %g7
+	bne,pt		%xcc, 1b
+	 add		%o0, 32, %o0
+	retl
+	 nop
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
+
+	.globl	niagara_patch_pageops
+	.type	niagara_patch_pageops,#function
+niagara_patch_pageops:
+	NG_DO_PATCH(copy_user_page, NGcopy_user_page)
+	NG_DO_PATCH(_clear_page, NGclear_page)
+	NG_DO_PATCH(clear_user_page, NGclear_user_page)
+	retl
+	 nop
+	.size	niagara_patch_pageops,.-niagara_patch_pageops
diff --git a/arch/sparc64/lib/NGpatch.S b/arch/sparc64/lib/NGpatch.S
new file mode 100644
index 000000000000..3b0674fc3366
--- /dev/null
+++ b/arch/sparc64/lib/NGpatch.S
@@ -0,0 +1,33 @@
+/* NGpatch.S: Patch Ultra-I routines with Niagara variant.
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#define BRANCH_ALWAYS	0x10680000
+#define NOP		0x01000000
+#define NG_DO_PATCH(OLD, NEW)	\
+	sethi	%hi(NEW), %g1; \
+	or	%g1, %lo(NEW), %g1; \
+	sethi	%hi(OLD), %g2; \
+	or	%g2, %lo(OLD), %g2; \
+	sub	%g1, %g2, %g1; \
+	sethi	%hi(BRANCH_ALWAYS), %g3; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
+	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
+	or	%g3, %g1, %g3; \
+	stw	%g3, [%g2]; \
+	sethi	%hi(NOP), %g3; \
+	or	%g3, %lo(NOP), %g3; \
+	stw	%g3, [%g2 + 0x4]; \
+	flush	%g2;
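+
+/* NG_DO_PATCH overwrites the first two instructions of OLD with a
+ * "ba,pt %xcc, NEW" and a nop: the byte offset NEW - OLD is shifted
+ * right by two and truncated to the branch's 19-bit displacement field
+ * (the sll by 11 then srl by 13) before being OR'd into BRANCH_ALWAYS,
+ * and the patched words are then flushed from the I-cache.
+ */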
+
+	.globl	niagara_patch_copyops
+	.type	niagara_patch_copyops,#function
+niagara_patch_copyops:
+	NG_DO_PATCH(memcpy, NGmemcpy)
+	NG_DO_PATCH(___copy_from_user, NGcopy_from_user)
+	NG_DO_PATCH(___copy_to_user, NGcopy_to_user)
+	retl
+	 nop
+	.size	niagara_patch_copyops,.-niagara_patch_copyops
diff --git a/arch/sparc64/lib/U3patch.S b/arch/sparc64/lib/U3patch.S
index e2b6c5e4b95a..ecc302619a6e 100644
--- a/arch/sparc64/lib/U3patch.S
+++ b/arch/sparc64/lib/U3patch.S
@@ -12,7 +12,8 @@
 	or	%g2, %lo(OLD), %g2; \
 	sub	%g1, %g2, %g1; \
 	sethi	%hi(BRANCH_ALWAYS), %g3; \
-	srl	%g1, 2, %g1; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
 	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
 	or	%g3, %g1, %g3; \
 	stw	%g3, [%g2]; \
diff --git a/arch/sparc64/lib/bzero.S b/arch/sparc64/lib/bzero.S
index 1d2abcfa4e52..c7bbae8c590f 100644
--- a/arch/sparc64/lib/bzero.S
+++ b/arch/sparc64/lib/bzero.S
@@ -98,12 +98,12 @@ __bzero_done:
 	.text;			\
 	.align 4;
 
-	.globl	__bzero_noasi
-	.type	__bzero_noasi, #function
-__bzero_noasi:		/* %o0=buf, %o1=len */
-	brz,pn		%o1, __bzero_noasi_done
+	.globl	__clear_user
+	.type	__clear_user, #function
+__clear_user:		/* %o0=buf, %o1=len */
+	brz,pn		%o1, __clear_user_done
 	 cmp		%o1, 16
-	bl,pn		%icc, __bzero_noasi_tiny
+	bl,pn		%icc, __clear_user_tiny
 	 EX_ST(prefetcha [%o0 + 0x00] %asi, #n_writes)
 	andcc		%o0, 0x3, %g0
 	be,pt		%icc, 2f
@@ -145,14 +145,14 @@ __bzero_noasi:		/* %o0=buf, %o1=len */
 	subcc		%g1, 8, %g1
 	bne,pt		%icc, 5b
 	 add		%o0, 0x8, %o0
-6:	brz,pt		%o1, __bzero_noasi_done
+6:	brz,pt		%o1, __clear_user_done
 	 nop
-__bzero_noasi_tiny:
+__clear_user_tiny:
 1:	EX_ST(stba	%g0, [%o0 + 0x00] %asi)
 	subcc		%o1, 1, %o1
 	bne,pt		%icc, 1b
 	 add		%o0, 1, %o0
-__bzero_noasi_done:
+__clear_user_done:
 	retl
 	 clr		%o0
-	.size		__bzero_noasi, .-__bzero_noasi
+	.size		__clear_user, .-__clear_user
diff --git a/arch/sparc64/lib/clear_page.S b/arch/sparc64/lib/clear_page.S
index b59884ef051d..77e531f6c2a7 100644
--- a/arch/sparc64/lib/clear_page.S
+++ b/arch/sparc64/lib/clear_page.S
@@ -9,6 +9,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/spitfire.h>
+#include <asm/head.h>
 
 	/* What we used to do was lock a TLB entry into a specific
 	 * TLB slot, clear the page with interrupts disabled, then
@@ -22,9 +23,6 @@
 	 * disable preemption during the clear.
 	 */
 
-#define TTE_BITS_TOP	(_PAGE_VALID | _PAGE_SZBITS)
-#define TTE_BITS_BOTTOM	(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_L | _PAGE_W)
-
 	.text
 
 	.globl		_clear_page
@@ -43,12 +41,11 @@ clear_user_page:	/* %o0=dest, %o1=vaddr */
 	sethi		%hi(PAGE_SIZE), %o4
 
 	sllx		%g2, 32, %g2
-	sethi		%uhi(TTE_BITS_TOP), %g3
+	sethi		%hi(PAGE_KERNEL_LOCKED), %g3
 
-	sllx		%g3, 32, %g3
+	ldx		[%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3
 	sub		%o0, %g2, %g1		! paddr
 
-	or		%g3, TTE_BITS_BOTTOM, %g3
 	and		%o1, %o4, %o0		! vaddr D-cache alias bit
 
 	or		%g1, %g3, %g1		! TTE data
@@ -66,7 +63,8 @@ clear_user_page:	/* %o0=dest, %o1=vaddr */
 	wrpr		%o4, PSTATE_IE, %pstate
 	stxa		%o0, [%g3] ASI_DMMU
 	stxa		%g1, [%g0] ASI_DTLB_DATA_IN
-	flush		%g6
+	sethi		%hi(KERNBASE), %g1
+	flush		%g1
 	wrpr		%o4, 0x0, %pstate
 
 	mov		1, %o4
diff --git a/arch/sparc64/lib/copy_page.S b/arch/sparc64/lib/copy_page.S
index feebb14fd27a..37460666a5c3 100644
--- a/arch/sparc64/lib/copy_page.S
+++ b/arch/sparc64/lib/copy_page.S
@@ -23,8 +23,6 @@
 	 * disable preemption during the clear.
 	 */
 
-#define TTE_BITS_TOP	(_PAGE_VALID | _PAGE_SZBITS)
-#define TTE_BITS_BOTTOM	(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_L | _PAGE_W)
 #define	DCACHE_SIZE	(PAGE_SIZE * 2)
 
 #if (PAGE_SHIFT == 13) || (PAGE_SHIFT == 19)
@@ -52,13 +50,12 @@ copy_user_page:		/* %o0=dest, %o1=src, %o2=vaddr */
 	sethi		%hi(PAGE_SIZE), %o3
 
 	sllx		%g2, 32, %g2
-	sethi		%uhi(TTE_BITS_TOP), %g3
+	sethi		%hi(PAGE_KERNEL_LOCKED), %g3
 
-	sllx		%g3, 32, %g3
+	ldx		[%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3
 	sub		%o0, %g2, %g1		! dest paddr
 
 	sub		%o1, %g2, %g2		! src paddr
-	or		%g3, TTE_BITS_BOTTOM, %g3
 
 	and		%o2, %o3, %o0		! vaddr D-cache alias bit
 	or		%g1, %g3, %g1		! dest TTE data
diff --git a/arch/sparc64/lib/delay.c b/arch/sparc64/lib/delay.c
index e8808727617a..fb27e54a03ee 100644
--- a/arch/sparc64/lib/delay.c
+++ b/arch/sparc64/lib/delay.c
@@ -1,6 +1,6 @@
 /* delay.c: Delay loops for sparc64
  *
- * Copyright (C) 2004 David S. Miller <davem@redhat.com>
+ * Copyright (C) 2004, 2006 David S. Miller <davem@davemloft.net>
  *
  * Based heavily upon x86 variant which is:
  *	Copyright (C) 1993 Linus Torvalds
@@ -8,19 +8,16 @@
  */
 
 #include <linux/delay.h>
+#include <asm/timer.h>
 
 void __delay(unsigned long loops)
 {
-	__asm__ __volatile__(
-"	b,pt	%%xcc, 1f\n"
-"	 cmp	%0, 0\n"
-"	.align	32\n"
-"1:\n"
-"	bne,pt	%%xcc, 1b\n"
-"	 subcc	%0, 1, %0\n"
-	: "=&r" (loops)
-	: "0" (loops)
-	: "cc");
+	unsigned long bclock, now;
+
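+	/* Busy-wait until the cpu tick counter has advanced by 'loops'. */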
+	bclock = tick_ops->get_tick();
+	do {
+		now = tick_ops->get_tick();
+	} while ((now-bclock) < loops);
 }
 
 /* We used to multiply by HZ after shifting down by 32 bits
diff --git a/arch/sparc64/lib/find_bit.c b/arch/sparc64/lib/find_bit.c
deleted file mode 100644
index 6059557067b4..000000000000
--- a/arch/sparc64/lib/find_bit.c
+++ /dev/null
@@ -1,127 +0,0 @@
-#include <linux/bitops.h>
-
-/**
- * find_next_bit - find the next set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
-				unsigned long offset)
-{
-	const unsigned long *p = addr + (offset >> 6);
-	unsigned long result = offset & ~63UL;
-	unsigned long tmp;
-
-	if (offset >= size)
-		return size;
-	size -= result;
-	offset &= 63UL;
-	if (offset) {
-		tmp = *(p++);
-		tmp &= (~0UL << offset);
-		if (size < 64)
-			goto found_first;
-		if (tmp)
-			goto found_middle;
-		size -= 64;
-		result += 64;
-	}
-	while (size & ~63UL) {
-		if ((tmp = *(p++)))
-			goto found_middle;
-		result += 64;
-		size -= 64;
-	}
-	if (!size)
-		return result;
-	tmp = *p;
-
-found_first:
-	tmp &= (~0UL >> (64 - size));
-	if (tmp == 0UL)        /* Are any bits set? */
-		return result + size; /* Nope. */
-found_middle:
-	return result + __ffs(tmp);
-}
-
-/* find_next_zero_bit() finds the first zero bit in a bit string of length
- * 'size' bits, starting the search at bit 'offset'. This is largely based
- * on Linus's ALPHA routines, which are pretty portable BTW.
- */
-
-unsigned long find_next_zero_bit(const unsigned long *addr,
-			unsigned long size, unsigned long offset)
-{
-	const unsigned long *p = addr + (offset >> 6);
-	unsigned long result = offset & ~63UL;
-	unsigned long tmp;
-
-	if (offset >= size)
-		return size;
-	size -= result;
-	offset &= 63UL;
-	if (offset) {
-		tmp = *(p++);
-		tmp |= ~0UL >> (64-offset);
-		if (size < 64)
-			goto found_first;
-		if (~tmp)
-			goto found_middle;
-		size -= 64;
-		result += 64;
-	}
-	while (size & ~63UL) {
-		if (~(tmp = *(p++)))
-			goto found_middle;
-		result += 64;
-		size -= 64;
-	}
-	if (!size)
-		return result;
-	tmp = *p;
-
-found_first:
-	tmp |= ~0UL << size;
-	if (tmp == ~0UL)        /* Are any bits zero? */
-		return result + size; /* Nope. */
-found_middle:
-	return result + ffz(tmp);
-}
-
-unsigned long find_next_zero_le_bit(unsigned long *addr, unsigned long size, unsigned long offset)
-{
-	unsigned long *p = addr + (offset >> 6);
-	unsigned long result = offset & ~63UL;
-	unsigned long tmp;
-
-	if (offset >= size)
-		return size;
-	size -= result;
-	offset &= 63UL;
-	if(offset) {
-		tmp = __swab64p(p++);
-		tmp |= (~0UL >> (64-offset));
-		if(size < 64)
-			goto found_first;
-		if(~tmp)
-			goto found_middle;
-		size -= 64;
-		result += 64;
-	}
-	while(size & ~63) {
-		if(~(tmp = __swab64p(p++)))
-			goto found_middle;
-		result += 64;
-		size -= 64;
-	}
-	if(!size)
-		return result;
-	tmp = __swab64p(p);
-found_first:
-	tmp |= (~0UL << size);
-	if (tmp == ~0UL)        /* Are any bits zero? */
-		return result + size; /* Nope. */
-found_middle:
-	return result + ffz(tmp);
-}
diff --git a/arch/sparc64/lib/xor.S b/arch/sparc64/lib/xor.S
index 4cd5d2be1ae1..a79c8888170d 100644
--- a/arch/sparc64/lib/xor.S
+++ b/arch/sparc64/lib/xor.S
@@ -2,9 +2,10 @@
  * arch/sparc64/lib/xor.S
  *
  * High speed xor_block operation for RAID4/5 utilizing the
- * UltraSparc Visual Instruction Set.
+ * UltraSparc Visual Instruction Set and Niagara store-init/twin-load.
  *
  * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
  */
 
 #include <asm/visasm.h>
@@ -19,6 +20,8 @@
  */
 	.text
 	.align	32
+
+	/* VIS versions. */
 	.globl	xor_vis_2
 	.type	xor_vis_2,#function
 xor_vis_2:
@@ -352,3 +355,298 @@ xor_vis_5:
 	ret
 	 restore
 	.size	xor_vis_5, .-xor_vis_5
+
+	/* Niagara versions. */
+	.globl		xor_niagara_2
+	.type		xor_niagara_2,#function
+xor_niagara_2:		/* %o0=bytes, %o1=dest, %o2=src */
+	save		%sp, -192, %sp
+	prefetch	[%i1], #n_writes
+	prefetch	[%i2], #one_read
+	rd		%asi, %g7
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	srlx		%i0, 6, %g1
+	mov		%i1, %i0
+	mov		%i2, %i1
+1:	ldda		[%i1 + 0x00] %asi, %i2	/* %i2/%i3 = src  + 0x00 */
+	ldda		[%i1 + 0x10] %asi, %i4	/* %i4/%i5 = src  + 0x10 */
+	ldda		[%i1 + 0x20] %asi, %g2	/* %g2/%g3 = src  + 0x20 */
+	ldda		[%i1 + 0x30] %asi, %l0	/* %l0/%l1 = src  + 0x30 */
+	prefetch	[%i1 + 0x40], #one_read
+	ldda		[%i0 + 0x00] %asi, %o0  /* %o0/%o1 = dest + 0x00 */
+	ldda		[%i0 + 0x10] %asi, %o2  /* %o2/%o3 = dest + 0x10 */
+	ldda		[%i0 + 0x20] %asi, %o4  /* %o4/%o5 = dest + 0x20 */
+	ldda		[%i0 + 0x30] %asi, %l2  /* %l2/%l3 = dest + 0x30 */
+	prefetch	[%i0 + 0x40], #n_writes
+	xor		%o0, %i2, %o0
+	xor		%o1, %i3, %o1
+	stxa		%o0, [%i0 + 0x00] %asi
+	stxa		%o1, [%i0 + 0x08] %asi
+	xor		%o2, %i4, %o2
+	xor		%o3, %i5, %o3
+	stxa		%o2, [%i0 + 0x10] %asi
+	stxa		%o3, [%i0 + 0x18] %asi
+	xor		%o4, %g2, %o4
+	xor		%o5, %g3, %o5
+	stxa		%o4, [%i0 + 0x20] %asi
+	stxa		%o5, [%i0 + 0x28] %asi
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x30] %asi
+	stxa		%l3, [%i0 + 0x38] %asi
+	add		%i0, 0x40, %i0
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%i1, 0x40, %i1
+	membar		#Sync
+	wr		%g7, 0x0, %asi
+	ret
+	 restore
+	.size		xor_niagara_2, .-xor_niagara_2
+
+	.globl		xor_niagara_3
+	.type		xor_niagara_3,#function
+xor_niagara_3:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */
+	save		%sp, -192, %sp
+	prefetch	[%i1], #n_writes
+	prefetch	[%i2], #one_read
+	prefetch	[%i3], #one_read
+	rd		%asi, %g7
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	srlx		%i0, 6, %g1
+	mov		%i1, %i0
+	mov		%i2, %i1
+	mov		%i3, %l7
+1:	ldda		[%i1 + 0x00] %asi, %i2	/* %i2/%i3 = src1 + 0x00 */
+	ldda		[%i1 + 0x10] %asi, %i4	/* %i4/%i5 = src1 + 0x10 */
+	ldda		[%l7 + 0x00] %asi, %g2	/* %g2/%g3 = src2 + 0x00 */
+	ldda		[%l7 + 0x10] %asi, %l0	/* %l0/%l1 = src2 + 0x10 */
+	ldda		[%i0 + 0x00] %asi, %o0  /* %o0/%o1 = dest + 0x00 */
+	ldda		[%i0 + 0x10] %asi, %o2  /* %o2/%o3 = dest + 0x10 */
+	xor		%g2, %i2, %g2
+	xor		%g3, %i3, %g3
+	xor		%o0, %g2, %o0
+	xor		%o1, %g3, %o1
+	stxa		%o0, [%i0 + 0x00] %asi
+	stxa		%o1, [%i0 + 0x08] %asi
+	ldda		[%i1 + 0x20] %asi, %i2	/* %i2/%i3 = src1 + 0x20 */
+	ldda		[%l7 + 0x20] %asi, %g2	/* %g2/%g3 = src2 + 0x20 */
+	ldda		[%i0 + 0x20] %asi, %o0	/* %o0/%o1 = dest + 0x20 */
+	xor		%l0, %i4, %l0
+	xor		%l1, %i5, %l1
+	xor		%o2, %l0, %o2
+	xor		%o3, %l1, %o3
+	stxa		%o2, [%i0 + 0x10] %asi
+	stxa		%o3, [%i0 + 0x18] %asi
+	ldda		[%i1 + 0x30] %asi, %i4	/* %i4/%i5 = src1 + 0x30 */
+	ldda		[%l7 + 0x30] %asi, %l0	/* %l0/%l1 = src2 + 0x30 */
+	ldda		[%i0 + 0x30] %asi, %o2	/* %o2/%o3 = dest + 0x30 */
+	prefetch	[%i1 + 0x40], #one_read
+	prefetch	[%l7 + 0x40], #one_read
+	prefetch	[%i0 + 0x40], #n_writes
+	xor		%g2, %i2, %g2
+	xor		%g3, %i3, %g3
+	xor		%o0, %g2, %o0
+	xor		%o1, %g3, %o1
+	stxa		%o0, [%i0 + 0x20] %asi
+	stxa		%o1, [%i0 + 0x28] %asi
+	xor		%l0, %i4, %l0
+	xor		%l1, %i5, %l1
+	xor		%o2, %l0, %o2
+	xor		%o3, %l1, %o3
+	stxa		%o2, [%i0 + 0x30] %asi
+	stxa		%o3, [%i0 + 0x38] %asi
+	add		%i0, 0x40, %i0
+	add		%i1, 0x40, %i1
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%l7, 0x40, %l7
+	membar		#Sync
+	wr		%g7, 0x0, %asi
+	ret
+	 restore
+	.size		xor_niagara_3, .-xor_niagara_3
+
+	.globl		xor_niagara_4
+	.type		xor_niagara_4,#function
+xor_niagara_4:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */
+	save		%sp, -192, %sp
+	prefetch	[%i1], #n_writes
+	prefetch	[%i2], #one_read
+	prefetch	[%i3], #one_read
+	prefetch	[%i4], #one_read
+	rd		%asi, %g7
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	srlx		%i0, 6, %g1
+	mov		%i1, %i0
+	mov		%i2, %i1
+	mov		%i3, %l7
+	mov		%i4, %l6
+1:	ldda		[%i1 + 0x00] %asi, %i2	/* %i2/%i3 = src1 + 0x00 */
+	ldda		[%l7 + 0x00] %asi, %i4	/* %i4/%i5 = src2 + 0x00 */
+	ldda		[%l6 + 0x00] %asi, %g2	/* %g2/%g3 = src3 + 0x00 */
+	ldda		[%i0 + 0x00] %asi, %l0	/* %l0/%l1 = dest + 0x00 */
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x10] %asi, %i2	/* %i2/%i3 = src1 + 0x10 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x10] %asi, %i4	/* %i4/%i5 = src2 + 0x10 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	stxa		%l0, [%i0 + 0x00] %asi
+	stxa		%l1, [%i0 + 0x08] %asi
+	ldda		[%l6 + 0x10] %asi, %g2	/* %g2/%g3 = src3 + 0x10 */
+	ldda		[%i0 + 0x10] %asi, %l0	/* %l0/%l1 = dest + 0x10 */
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x20] %asi, %i2	/* %i2/%i3 = src1 + 0x20 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x20] %asi, %i4	/* %i4/%i5 = src2 + 0x20 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	stxa		%l0, [%i0 + 0x10] %asi
+	stxa		%l1, [%i0 + 0x18] %asi
+	ldda		[%l6 + 0x20] %asi, %g2	/* %g2/%g3 = src3 + 0x20 */
+	ldda		[%i0 + 0x20] %asi, %l0	/* %l0/%l1 = dest + 0x20 */
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x30] %asi, %i2	/* %i2/%i3 = src1 + 0x30 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x30] %asi, %i4	/* %i4/%i5 = src2 + 0x30 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	stxa		%l0, [%i0 + 0x20] %asi
+	stxa		%l1, [%i0 + 0x28] %asi
+	ldda		[%l6 + 0x30] %asi, %g2	/* %g2/%g3 = src3 + 0x30 */
+	ldda		[%i0 + 0x30] %asi, %l0	/* %l0/%l1 = dest + 0x30 */
+
+	prefetch	[%i1 + 0x40], #one_read
+	prefetch	[%l7 + 0x40], #one_read
+	prefetch	[%l6 + 0x40], #one_read
+	prefetch	[%i0 + 0x40], #n_writes
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	stxa		%l0, [%i0 + 0x30] %asi
+	stxa		%l1, [%i0 + 0x38] %asi
+
+	add		%i0, 0x40, %i0
+	add		%i1, 0x40, %i1
+	add		%l7, 0x40, %l7
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%l6, 0x40, %l6
+	membar		#Sync
+	wr		%g7, 0x0, %asi
+	ret
+	 restore
+	.size		xor_niagara_4, .-xor_niagara_4
+
+	.globl		xor_niagara_5
+	.type		xor_niagara_5,#function
+xor_niagara_5:		/* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */
+	save		%sp, -192, %sp
+	prefetch	[%i1], #n_writes
+	prefetch	[%i2], #one_read
+	prefetch	[%i3], #one_read
+	prefetch	[%i4], #one_read
+	prefetch	[%i5], #one_read
+	rd		%asi, %g7
+	wr		%g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
+	srlx		%i0, 6, %g1
+	mov		%i1, %i0
+	mov		%i2, %i1
+	mov		%i3, %l7
+	mov		%i4, %l6
+	mov		%i5, %l5
+1:	ldda		[%i1 + 0x00] %asi, %i2	/* %i2/%i3 = src1 + 0x00 */
+	ldda		[%l7 + 0x00] %asi, %i4	/* %i4/%i5 = src2 + 0x00 */
+	ldda		[%l6 + 0x00] %asi, %g2	/* %g2/%g3 = src3 + 0x00 */
+	ldda		[%l5 + 0x00] %asi, %l0	/* %l0/%l1 = src4 + 0x00 */
+	ldda		[%i0 + 0x00] %asi, %l2	/* %l2/%l3 = dest + 0x00 */
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x10] %asi, %i2	/* %i2/%i3 = src1 + 0x10 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x10] %asi, %i4	/* %i4/%i5 = src2 + 0x10 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	ldda		[%l6 + 0x10] %asi, %g2	/* %g2/%g3 = src3 + 0x10 */
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x00] %asi
+	stxa		%l3, [%i0 + 0x08] %asi
+	ldda		[%l5 + 0x10] %asi, %l0	/* %l0/%l1 = src4 + 0x10 */
+	ldda		[%i0 + 0x10] %asi, %l2	/* %l2/%l3 = dest + 0x10 */
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x20] %asi, %i2	/* %i2/%i3 = src1 + 0x20 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x20] %asi, %i4	/* %i4/%i5 = src2 + 0x20 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	ldda		[%l6 + 0x20] %asi, %g2	/* %g2/%g3 = src3 + 0x20 */
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x10] %asi
+	stxa		%l3, [%i0 + 0x18] %asi
+	ldda		[%l5 + 0x20] %asi, %l0	/* %l0/%l1 = src4 + 0x20 */
+	ldda		[%i0 + 0x20] %asi, %l2	/* %l2/%l3 = dest + 0x20 */
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	ldda		[%i1 + 0x30] %asi, %i2	/* %i2/%i3 = src1 + 0x30 */
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	ldda		[%l7 + 0x30] %asi, %i4	/* %i4/%i5 = src2 + 0x30 */
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	ldda		[%l6 + 0x30] %asi, %g2	/* %g2/%g3 = src3 + 0x30 */
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x20] %asi
+	stxa		%l3, [%i0 + 0x28] %asi
+	ldda		[%l5 + 0x30] %asi, %l0	/* %l0/%l1 = src4 + 0x30 */
+	ldda		[%i0 + 0x30] %asi, %l2	/* %l2/%l3 = dest + 0x30 */
+
+	prefetch	[%i1 + 0x40], #one_read
+	prefetch	[%l7 + 0x40], #one_read
+	prefetch	[%l6 + 0x40], #one_read
+	prefetch	[%l5 + 0x40], #one_read
+	prefetch	[%i0 + 0x40], #n_writes
+
+	xor		%i4, %i2, %i4
+	xor		%i5, %i3, %i5
+	xor		%g2, %i4, %g2
+	xor		%g3, %i5, %g3
+	xor		%l0, %g2, %l0
+	xor		%l1, %g3, %l1
+	xor		%l2, %l0, %l2
+	xor		%l3, %l1, %l3
+	stxa		%l2, [%i0 + 0x30] %asi
+	stxa		%l3, [%i0 + 0x38] %asi
+
+	add		%i0, 0x40, %i0
+	add		%i1, 0x40, %i1
+	add		%l7, 0x40, %l7
+	add		%l6, 0x40, %l6
+	subcc		%g1, 1, %g1
+	bne,pt		%xcc, 1b
+	 add		%l5, 0x40, %l5
+	membar		#Sync
+	wr		%g7, 0x0, %asi
+	ret
+	 restore
+	.size		xor_niagara_5, .-xor_niagara_5
diff --git a/arch/sparc64/math-emu/math.c b/arch/sparc64/math-emu/math.c
index 2ae05cd7b773..6ee496c2864a 100644
--- a/arch/sparc64/math-emu/math.c
+++ b/arch/sparc64/math-emu/math.c
@@ -206,9 +206,29 @@ int do_mathemu(struct pt_regs *regs, struct fpustate *f)
 			case FSTOQ: TYPE(3,3,1,1,1,0,0); break;
 			case FDTOQ: TYPE(3,3,1,2,1,0,0); break;
 			case FQTOI: TYPE(3,1,0,3,1,0,0); break;
+
+			/* We can get either unimplemented or unfinished
+			 * for these cases.  Pre-Niagara systems generate
+			 * unfinished fpop for SUBNORMAL cases, and Niagara
+			 * always gives unimplemented fpop for fsqrt{s,d}.
+			 */
+			case FSQRTS: {
+				unsigned long x = current_thread_info()->xfsr[0];
+
+				x = (x >> 14) & 0xf;
+				TYPE(x,1,1,1,1,0,0);
+				break;
+			}
+
+			case FSQRTD: {
+				unsigned long x = current_thread_info()->xfsr[0];
+
+				x = (x >> 14) & 0xf;
+				TYPE(x,2,1,2,1,0,0);
+				break;
+			}
+
 			/* SUBNORMAL - ftt == 2 */
-			case FSQRTS: TYPE(2,1,1,1,1,0,0); break;
-			case FSQRTD: TYPE(2,2,1,2,1,0,0); break;
 			case FADDD:
 			case FSUBD:
 			case FMULD:
diff --git a/arch/sparc64/mm/Makefile b/arch/sparc64/mm/Makefile
index 9d0960e69f48..e415bf942bcd 100644
--- a/arch/sparc64/mm/Makefile
+++ b/arch/sparc64/mm/Makefile
@@ -5,6 +5,6 @@
 EXTRA_AFLAGS := -ansi
 EXTRA_CFLAGS := -Werror
 
-obj-y    := ultra.o tlb.o fault.o init.o generic.o
+obj-y    := ultra.o tlb.o tsb.o fault.o init.o generic.o
 
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c
index 6f0539aa44d0..0db2f7d9fab5 100644
--- a/arch/sparc64/mm/fault.c
+++ b/arch/sparc64/mm/fault.c
@@ -29,6 +29,7 @@
 #include <asm/lsu.h>
 #include <asm/sections.h>
 #include <asm/kdebug.h>
+#include <asm/mmu_context.h>
 
 /*
  * To debug kernel to catch accesses to certain virtual/physical addresses.
@@ -91,12 +92,13 @@ static void __kprobes unhandled_fault(unsigned long address,
 	die_if_kernel("Oops", regs);
 }
 
-static void bad_kernel_pc(struct pt_regs *regs)
+static void bad_kernel_pc(struct pt_regs *regs, unsigned long vaddr)
 {
 	unsigned long *ksp;
 
 	printk(KERN_CRIT "OOPS: Bogus kernel PC [%016lx] in fault handler\n",
 	       regs->tpc);
+	printk(KERN_CRIT "OOPS: Fault was to vaddr[%lx]\n", vaddr);
 	__asm__("mov %%sp, %0" : "=r" (ksp));
 	show_stack(current, ksp);
 	unhandled_fault(regs->tpc, current, regs);
@@ -137,7 +139,7 @@ static unsigned int get_user_insn(unsigned long tpc)
 	if (!pte_present(pte))
 		goto out;
 
-	pa  = (pte_val(pte) & _PAGE_PADDR);
+	pa  = (pte_pfn(pte) << PAGE_SHIFT);
 	pa += (tpc & ~PAGE_MASK);
 
 	/* Use phys bypass so we don't pollute dtlb/dcache. */
@@ -257,7 +259,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 	struct vm_area_struct *vma;
 	unsigned int insn = 0;
 	int si_code, fault_code;
-	unsigned long address;
+	unsigned long address, mm_rss;
 
 	fault_code = get_thread_fault_code();
 
@@ -280,7 +282,7 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
 		    (tpc >= MODULES_VADDR && tpc < MODULES_END)) {
 			/* Valid, no problems... */
 		} else {
-			bad_kernel_pc(regs);
+			bad_kernel_pc(regs, address);
 			return;
 		}
 	}
@@ -406,6 +408,20 @@ good_area:
 	}
 
 	up_read(&mm->mmap_sem);
+
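+	/* Now that mmap_sem is released, see whether this mm has outgrown
+	 * its TSB(s) and grow them if so.
+	 */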
+	mm_rss = get_mm_rss(mm);
+#ifdef CONFIG_HUGETLB_PAGE
+	mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE));
+#endif
+	if (unlikely(mm_rss >
+		     mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit))
+		tsb_grow(mm, MM_TSB_BASE, mm_rss);
+#ifdef CONFIG_HUGETLB_PAGE
+	mm_rss = mm->context.huge_pte_count;
+	if (unlikely(mm_rss >
+		     mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit))
+		tsb_grow(mm, MM_TSB_HUGE, mm_rss);
+#endif
 	return;
 
 	/*
diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c
index 580b63da836b..8cb06205d265 100644
--- a/arch/sparc64/mm/generic.c
+++ b/arch/sparc64/mm/generic.c
@@ -15,15 +15,6 @@
 #include <asm/page.h>
 #include <asm/tlbflush.h>
 
-static inline pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space)
-{
-	pte_t pte;
-	pte_val(pte) = (((page) | pgprot_val(prot) | _PAGE_E) &
-			~(unsigned long)_PAGE_CACHE);
-	pte_val(pte) |= (((unsigned long)space) << 32);
-	return pte;
-}
-
 /* Remap IO memory, the same way as remap_pfn_range(), but use
  * the obio memory space.
  *
@@ -48,24 +39,29 @@ static inline void io_remap_pte_range(struct mm_struct *mm, pte_t * pte,
 		pte_t entry;
 		unsigned long curend = address + PAGE_SIZE;
 		
-		entry = mk_pte_io(offset, prot, space);
+		entry = mk_pte_io(offset, prot, space, PAGE_SIZE);
 		if (!(address & 0xffff)) {
-			if (!(address & 0x3fffff) && !(offset & 0x3ffffe) && end >= address + 0x400000) {
-				entry = mk_pte_io(offset,
-						  __pgprot(pgprot_val (prot) | _PAGE_SZ4MB),
-						  space);
+			if (PAGE_SIZE < (4 * 1024 * 1024) &&
+			    !(address & 0x3fffff) &&
+			    !(offset & 0x3ffffe) &&
+			    end >= address + 0x400000) {
+				entry = mk_pte_io(offset, prot, space,
+						  4 * 1024 * 1024);
 				curend = address + 0x400000;
 				offset += 0x400000;
-			} else if (!(address & 0x7ffff) && !(offset & 0x7fffe) && end >= address + 0x80000) {
-				entry = mk_pte_io(offset,
-						  __pgprot(pgprot_val (prot) | _PAGE_SZ512K),
-						  space);
+			} else if (PAGE_SIZE < (512 * 1024) &&
+				   !(address & 0x7ffff) &&
+				   !(offset & 0x7fffe) &&
+				   end >= address + 0x80000) {
+				entry = mk_pte_io(offset, prot, space,
+						  512 * 1024);
 				curend = address + 0x80000;
 				offset += 0x80000;
-			} else if (!(offset & 0xfffe) && end >= address + 0x10000) {
-				entry = mk_pte_io(offset,
-						  __pgprot(pgprot_val (prot) | _PAGE_SZ64K),
-						  space);
+			} else if (PAGE_SIZE < (64 * 1024) &&
+				   !(offset & 0xfffe) &&
+				   end >= address + 0x10000) {
+				entry = mk_pte_io(offset, prot, space,
+						  64 * 1024);
 				curend = address + 0x10000;
 				offset += 0x10000;
 			} else
@@ -144,7 +140,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
 	vma->vm_pgoff = phys_base >> PAGE_SHIFT;
 
-	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
 	flush_cache_range(vma, beg, end);
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 625cbb336a23..074620d413d4 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -1,7 +1,7 @@
 /*
  * SPARC64 Huge TLB page support.
  *
- * Copyright (C) 2002, 2003 David S. Miller (davem@redhat.com)
+ * Copyright (C) 2002, 2003, 2006 David S. Miller (davem@davemloft.net)
  */
 
 #include <linux/config.h>
@@ -22,6 +22,175 @@
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
 
+/* Slightly simplified from the non-hugepage variant because by
+ * definition we don't have to worry about any page coloring stuff
+ */
+#define VA_EXCLUDE_START (0x0000080000000000UL - (1UL << 32UL))
+#define VA_EXCLUDE_END   (0xfffff80000000000UL + (1UL << 32UL))
+
+static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
+							unsigned long addr,
+							unsigned long len,
+							unsigned long pgoff,
+							unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct * vma;
+	unsigned long task_size = TASK_SIZE;
+	unsigned long start_addr;
+
+	if (test_thread_flag(TIF_32BIT))
+		task_size = STACK_TOP32;
+	if (unlikely(len >= VA_EXCLUDE_START))
+		return -ENOMEM;
+
+	if (len > mm->cached_hole_size) {
+	        start_addr = addr = mm->free_area_cache;
+	} else {
+	        start_addr = addr = TASK_UNMAPPED_BASE;
+	        mm->cached_hole_size = 0;
+	}
+
+	task_size -= len;
+
+full_search:
+	addr = ALIGN(addr, HPAGE_SIZE);
+
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point:  (!vma || addr < vma->vm_end). */
+		if (addr < VA_EXCLUDE_START &&
+		    (addr + len) >= VA_EXCLUDE_START) {
+			addr = VA_EXCLUDE_END;
+			vma = find_vma(mm, VA_EXCLUDE_END);
+		}
+		if (unlikely(task_size < addr)) {
+			if (start_addr != TASK_UNMAPPED_BASE) {
+				start_addr = addr = TASK_UNMAPPED_BASE;
+				mm->cached_hole_size = 0;
+				goto full_search;
+			}
+			return -ENOMEM;
+		}
+		if (likely(!vma || addr + len <= vma->vm_start)) {
+			/*
+			 * Remember the place where we stopped the search:
+			 */
+			mm->free_area_cache = addr + len;
+			return addr;
+		}
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
+		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+	}
+}
+
+static unsigned long
+hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+				  const unsigned long len,
+				  const unsigned long pgoff,
+				  const unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr = addr0;
+
+	/* This should only ever run for 32-bit processes.  */
+	BUG_ON(!test_thread_flag(TIF_32BIT));
+
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+ 	        mm->cached_hole_size = 0;
+ 		mm->free_area_cache = mm->mmap_base;
+ 	}
+
+	/* either no address requested or can't fit in requested address hole */
+	addr = mm->free_area_cache & HPAGE_MASK;
+
+	/* make sure it can fit in the remaining address space */
+	if (likely(addr > len)) {
+		vma = find_vma(mm, addr-len);
+		if (!vma || addr <= vma->vm_start) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr-len);
+		}
+	}
+
+	if (unlikely(mm->mmap_base < len))
+		goto bottomup;
+
+	addr = (mm->mmap_base-len) & HPAGE_MASK;
+
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * else if new region fits below vma->vm_start,
+		 * return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (likely(!vma || addr+len <= vma->vm_start)) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr);
+		}
+
+ 		/* remember the largest hole we saw so far */
+ 		if (addr + mm->cached_hole_size < vma->vm_start)
+ 		        mm->cached_hole_size = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = (vma->vm_start-len) & HPAGE_MASK;
+	} while (likely(len < vma->vm_start));
+
+bottomup:
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->cached_hole_size = ~0UL;
+  	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
+
+	return addr;
+}
+
+unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long task_size = TASK_SIZE;
+
+	if (test_thread_flag(TIF_32BIT))
+		task_size = STACK_TOP32;
+
+	if (len & ~HPAGE_MASK)
+		return -EINVAL;
+	if (len > task_size)
+		return -ENOMEM;
+
+	if (addr) {
+		addr = ALIGN(addr, HPAGE_SIZE);
+		vma = find_vma(mm, addr);
+		if (task_size - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+	if (mm->get_unmapped_area == arch_get_unmapped_area)
+		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+				pgoff, flags);
+	else
+		return hugetlb_get_unmapped_area_topdown(file, addr, len,
+				pgoff, flags);
+}
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -30,13 +199,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	pte_t *pte = NULL;
 
 	pgd = pgd_offset(mm, addr);
-	if (pgd) {
-		pud = pud_offset(pgd, addr);
-		if (pud) {
-			pmd = pmd_alloc(mm, pud, addr);
-			if (pmd)
-				pte = pte_alloc_map(mm, pmd, addr);
-		}
+	pud = pud_alloc(mm, pgd, addr);
+	if (pud) {
+		pmd = pmd_alloc(mm, pud, addr);
+		if (pmd)
+			pte = pte_alloc_map(mm, pmd, addr);
 	}
 	return pte;
 }
@@ -48,25 +215,28 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	pmd_t *pmd;
 	pte_t *pte = NULL;
 
+	addr &= HPAGE_MASK;
+
 	pgd = pgd_offset(mm, addr);
-	if (pgd) {
+	if (!pgd_none(*pgd)) {
 		pud = pud_offset(pgd, addr);
-		if (pud) {
+		if (!pud_none(*pud)) {
 			pmd = pmd_offset(pud, addr);
-			if (pmd)
+			if (!pmd_none(*pmd))
 				pte = pte_offset_map(pmd, addr);
 		}
 	}
 	return pte;
 }
 
-#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0)
-
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 		     pte_t *ptep, pte_t entry)
 {
 	int i;
 
+	if (!pte_present(*ptep) && pte_present(entry))
+		mm->context.huge_pte_count++;
+
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
 		set_pte_at(mm, addr, ptep, entry);
 		ptep++;
@@ -82,6 +252,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	int i;
 
 	entry = *ptep;
+	if (pte_present(entry))
+		mm->context.huge_pte_count--;
 
 	for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) {
 		pte_clear(mm, addr, ptep);
@@ -92,18 +264,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
 	return entry;
 }
 
-/*
- * This function checks for proper alignment of input addr and len parameters.
- */
-int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
-{
-	if (len & ~HPAGE_MASK)
-		return -EINVAL;
-	if (addr & ~HPAGE_MASK)
-		return -EINVAL;
-	return 0;
-}
-
 struct page *follow_huge_addr(struct mm_struct *mm,
 			      unsigned long address, int write)
 {
@@ -131,6 +291,15 @@ static void context_reload(void *__data)
 
 void hugetlb_prefault_arch_hook(struct mm_struct *mm)
 {
+	struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE];
+
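+	/* On the first hugetlb use by this mm, allocate the huge-page TSB
+	 * and get every cpu running this mm to start using it.
+	 */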
+	if (likely(tp->tsb != NULL))
+		return;
+
+	tsb_grow(mm, MM_TSB_HUGE, 0);
+	tsb_context_switch(mm);
+	smp_tsb_sync(mm);
+
 	/* On UltraSPARC-III+ and later, configure the second half of
 	 * the Data-TLB for huge pages.
 	 */
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 1e44ee26cee8..1539a8362b6f 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -6,6 +6,7 @@
  */
  
 #include <linux/config.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/string.h>
@@ -39,9 +40,27 @@
 #include <asm/tlb.h>
 #include <asm/spitfire.h>
 #include <asm/sections.h>
+#include <asm/tsb.h>
+#include <asm/hypervisor.h>
 
 extern void device_scan(void);
 
+#define MAX_PHYS_ADDRESS	(1UL << 42UL)
+#define KPTE_BITMAP_CHUNK_SZ	(256UL * 1024UL * 1024UL)
+#define KPTE_BITMAP_BYTES	\
+	((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 8)
+
+unsigned long kern_linear_pte_xor[2] __read_mostly;
+
+/* A bitmap, one bit for every 256MB of physical memory.  If the bit
+ * is clear, we should use a 4MB page (via kern_linear_pte_xor[0]);
+ * if it is set, we should use a 256MB page (via kern_linear_pte_xor[1]).
+ */
+unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
+
+/* A special kernel TSB for 4MB and 256MB linear mappings.  */
+struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
+
 #define MAX_BANKS	32
 
 static struct linux_prom64_registers pavail[MAX_BANKS] __initdata;
@@ -111,11 +130,9 @@ static void __init read_obp_memory(const char *property,
 
 unsigned long *sparc64_valid_addr_bitmap __read_mostly;
 
-/* Ugly, but necessary... -DaveM */
-unsigned long phys_base __read_mostly;
+/* Kernel physical address base and size in bytes.  */
 unsigned long kern_base __read_mostly;
 unsigned long kern_size __read_mostly;
-unsigned long pfn_base __read_mostly;
 
 /* get_new_mmu_context() uses "cache + 1".  */
 DEFINE_SPINLOCK(ctx_alloc_lock);
@@ -141,24 +158,28 @@ unsigned long sparc64_kern_sec_context __read_mostly;
 
 int bigkernel = 0;
 
-/* XXX Tune this... */
-#define PGT_CACHE_LOW	25
-#define PGT_CACHE_HIGH	50
+kmem_cache_t *pgtable_cache __read_mostly;
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+	clear_page(addr);
+}
+
+extern void tsb_cache_init(void);
 
-void check_pgt_cache(void)
+void pgtable_cache_init(void)
 {
-	preempt_disable();
-	if (pgtable_cache_size > PGT_CACHE_HIGH) {
-		do {
-			if (pgd_quicklist)
-				free_pgd_slow(get_pgd_fast());
-			if (pte_quicklist[0])
-				free_pte_slow(pte_alloc_one_fast(NULL, 0));
-			if (pte_quicklist[1])
-				free_pte_slow(pte_alloc_one_fast(NULL, 1 << (PAGE_SHIFT + 10)));
-		} while (pgtable_cache_size > PGT_CACHE_LOW);
+	pgtable_cache = kmem_cache_create("pgtable_cache",
+					  PAGE_SIZE, PAGE_SIZE,
+					  SLAB_HWCACHE_ALIGN |
+					  SLAB_MUST_HWCACHE_ALIGN,
+					  zero_ctor,
+					  NULL);
+	if (!pgtable_cache) {
+		prom_printf("Could not create pgtable_cache\n");
+		prom_halt();
 	}
-	preempt_enable();
+	tsb_cache_init();
 }
 
 #ifdef CONFIG_DEBUG_DCFLUSH
@@ -168,8 +189,9 @@ atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0);
 #endif
 #endif
 
-__inline__ void flush_dcache_page_impl(struct page *page)
+inline void flush_dcache_page_impl(struct page *page)
 {
+	BUG_ON(tlb_type == hypervisor);
 #ifdef CONFIG_DEBUG_DCFLUSH
 	atomic_inc(&dcpage_flushes);
 #endif
@@ -186,8 +208,8 @@ __inline__ void flush_dcache_page_impl(struct page *page)
 }
 
 #define PG_dcache_dirty		PG_arch_1
-#define PG_dcache_cpu_shift	24
-#define PG_dcache_cpu_mask	(256 - 1)
+#define PG_dcache_cpu_shift	24UL
+#define PG_dcache_cpu_mask	(256UL - 1UL)
 
 #if NR_CPUS > 256
 #error D-cache dirty tracking and thread_info->cpu need fixing for > 256 cpus
@@ -243,32 +265,78 @@ static __inline__ void clear_dcache_dirty_cpu(struct page *page, unsigned long c
 			     : "g1", "g7");
 }
 
+static inline void tsb_insert(struct tsb *ent, unsigned long tag, unsigned long pte)
+{
+	unsigned long tsb_addr = (unsigned long) ent;
+
+	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
+		tsb_addr = __pa(tsb_addr);
+
+	__tsb_insert(tsb_addr, tag, pte);
+}
+
+unsigned long _PAGE_ALL_SZ_BITS __read_mostly;
+unsigned long _PAGE_SZBITS __read_mostly;
+
 void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte)
 {
-	struct page *page;
-	unsigned long pfn;
-	unsigned long pg_flags;
-
-	pfn = pte_pfn(pte);
-	if (pfn_valid(pfn) &&
-	    (page = pfn_to_page(pfn), page_mapping(page)) &&
-	    ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) {
-		int cpu = ((pg_flags >> PG_dcache_cpu_shift) &
-			   PG_dcache_cpu_mask);
-		int this_cpu = get_cpu();
-
-		/* This is just to optimize away some function calls
-		 * in the SMP case.
-		 */
-		if (cpu == this_cpu)
-			flush_dcache_page_impl(page);
-		else
-			smp_flush_dcache_page_impl(page, cpu);
+	struct mm_struct *mm;
+	struct tsb *tsb;
+	unsigned long tag, flags;
+	unsigned long tsb_index, tsb_hash_shift;
+
+	if (tlb_type != hypervisor) {
+		unsigned long pfn = pte_pfn(pte);
+		unsigned long pg_flags;
+		struct page *page;
+
+		if (pfn_valid(pfn) &&
+		    (page = pfn_to_page(pfn), page_mapping(page)) &&
+		    ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) {
+			int cpu = ((pg_flags >> PG_dcache_cpu_shift) &
+				   PG_dcache_cpu_mask);
+			int this_cpu = get_cpu();
+
+			/* This is just to optimize away some function calls
+			 * in the SMP case.
+			 */
+			if (cpu == this_cpu)
+				flush_dcache_page_impl(page);
+			else
+				smp_flush_dcache_page_impl(page, cpu);
+
+			clear_dcache_dirty_cpu(page, cpu);
 
-		clear_dcache_dirty_cpu(page, cpu);
+			put_cpu();
+		}
+	}
 
-		put_cpu();
+	mm = vma->vm_mm;
+
+	tsb_index = MM_TSB_BASE;
+	tsb_hash_shift = PAGE_SHIFT;
+
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) {
+		if ((tlb_type == hypervisor &&
+		     (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) ||
+		    (tlb_type != hypervisor &&
+		     (pte_val(pte) & _PAGE_SZALL_4U) == _PAGE_SZHUGE_4U)) {
+			tsb_index = MM_TSB_HUGE;
+			tsb_hash_shift = HPAGE_SHIFT;
+		}
 	}
+#endif
+
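+	/* Preload the TSB entry this address hashes to; the tag is the
+	 * virtual address shifted down by 22 bits, which is what the TSB
+	 * miss handlers compare against.
+	 */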
+	tsb = mm->context.tsb_block[tsb_index].tsb;
+	tsb += ((address >> tsb_hash_shift) &
+		(mm->context.tsb_block[tsb_index].tsb_nentries - 1UL));
+	tag = (address >> 22UL);
+	tsb_insert(tsb, tag, pte_val(pte));
+
+	spin_unlock_irqrestore(&mm->context.lock, flags);
 }
 
 void flush_dcache_page(struct page *page)
@@ -276,6 +344,9 @@ void flush_dcache_page(struct page *page)
 	struct address_space *mapping;
 	int this_cpu;
 
+	if (tlb_type == hypervisor)
+		return;
+
 	/* Do not bother with the expensive D-cache flush if it
 	 * is merely the zero page.  The 'bigcore' testcase in GDB
 	 * causes this case to run millions of times.
@@ -311,7 +382,7 @@ out:
 
 void __kprobes flush_icache_range(unsigned long start, unsigned long end)
 {
-	/* Cheetah has coherent I-cache. */
+	/* Cheetah and Hypervisor platform cpus have coherent I-cache. */
 	if (tlb_type == spitfire) {
 		unsigned long kaddr;
 
@@ -320,16 +391,6 @@ void __kprobes flush_icache_range(unsigned long start, unsigned long end)
 	}
 }
 
-unsigned long page_to_pfn(struct page *page)
-{
-	return (unsigned long) ((page - mem_map) + pfn_base);
-}
-
-struct page *pfn_to_page(unsigned long pfn)
-{
-	return (mem_map + (pfn - pfn_base));
-}
-
 void show_mem(void)
 {
 	printk("Mem-info:\n");
@@ -338,7 +399,6 @@ void show_mem(void)
 	       nr_swap_pages << (PAGE_SHIFT-10));
 	printk("%ld pages of RAM\n", num_physpages);
 	printk("%d free pages\n", nr_free_pages());
-	printk("%d pages in page table cache\n",pgtable_cache_size);
 }
 
 void mmu_info(struct seq_file *m)
@@ -349,6 +409,8 @@ void mmu_info(struct seq_file *m)
 		seq_printf(m, "MMU Type\t: Cheetah+\n");
 	else if (tlb_type == spitfire)
 		seq_printf(m, "MMU Type\t: Spitfire\n");
+	else if (tlb_type == hypervisor)
+		seq_printf(m, "MMU Type\t: Hypervisor (sun4v)\n");
 	else
 		seq_printf(m, "MMU Type\t: ???\n");
 
@@ -371,45 +433,13 @@ struct linux_prom_translation {
 /* Exported for kernel TLB miss handling in ktlb.S */
 struct linux_prom_translation prom_trans[512] __read_mostly;
 unsigned int prom_trans_ents __read_mostly;
-unsigned int swapper_pgd_zero __read_mostly;
-
-extern unsigned long prom_boot_page;
-extern void prom_remap(unsigned long physpage, unsigned long virtpage, int mmu_ihandle);
-extern int prom_get_mmu_ihandle(void);
-extern void register_prom_callbacks(void);
 
 /* Exported for SMP bootup purposes. */
 unsigned long kern_locked_tte_data;
 
-/*
- * Translate PROM's mapping we capture at boot time into physical address.
- * The second parameter is only set from prom_callback() invocations.
- */
-unsigned long prom_virt_to_phys(unsigned long promva, int *error)
-{
-	int i;
-
-	for (i = 0; i < prom_trans_ents; i++) {
-		struct linux_prom_translation *p = &prom_trans[i];
-
-		if (promva >= p->virt &&
-		    promva < (p->virt + p->size)) {
-			unsigned long base = p->data & _PAGE_PADDR;
-
-			if (error)
-				*error = 0;
-			return base + (promva & (8192 - 1));
-		}
-	}
-	if (error)
-		*error = 1;
-	return 0UL;
-}
-
 /* The obp translations are saved based on 8k pagesize, since obp can
  * use a mixture of pagesizes. Misses to the LOW_OBP_ADDRESS ->
- * HI_OBP_ADDRESS range are handled in ktlb.S and do not use the vpte
- * scheme (also, see rant in inherit_locked_prom_mappings()).
+ * HI_OBP_ADDRESS range are handled in ktlb.S.
  */
 static inline int in_obp_range(unsigned long vaddr)
 {
@@ -490,6 +520,36 @@ static void __init read_obp_translations(void)
 	}
 }
 
+static void __init hypervisor_tlb_lock(unsigned long vaddr,
+				       unsigned long pte,
+				       unsigned long mmu)
+{
+	register unsigned long func asm("%o5");
+	register unsigned long arg0 asm("%o0");
+	register unsigned long arg1 asm("%o1");
+	register unsigned long arg2 asm("%o2");
+	register unsigned long arg3 asm("%o3");
+
+	func = HV_FAST_MMU_MAP_PERM_ADDR;
+	arg0 = vaddr;
+	arg1 = 0;
+	arg2 = pte;
+	arg3 = mmu;
+	__asm__ __volatile__("ta	0x80"
+			     : "=&r" (func), "=&r" (arg0),
+			       "=&r" (arg1), "=&r" (arg2),
+			       "=&r" (arg3)
+			     : "0" (func), "1" (arg0), "2" (arg1),
+			       "3" (arg2), "4" (arg3));
+	if (arg0 != 0) {
+		prom_printf("hypervisor_tlb_lock[%lx:%lx:%lx:%lx]: "
+			    "errors with %lx\n", vaddr, 0UL, pte, mmu, arg0);
+		prom_halt();
+	}
+}
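hypervisor_tlb_lock() above spells out the sun4v fast-trap calling convention directly: the API number goes in %o5, up to four arguments in %o0-%o3, the `ta 0x80` trap enters the hypervisor, and the status comes back in %o0. A hedged generic wrapper following the same convention could look like the sketch below; sun4v_fast_trap4() is a hypothetical helper name, not an existing kernel interface.

/* Hypothetical 4-argument wrapper modeled on the register usage in
 * hypervisor_tlb_lock() above.  Returns the status from %o0 (0 on
 * success).
 */
static unsigned long sun4v_fast_trap4(unsigned long function,
				      unsigned long a0, unsigned long a1,
				      unsigned long a2, unsigned long a3)
{
	register unsigned long func asm("%o5") = function;
	register unsigned long arg0 asm("%o0") = a0;
	register unsigned long arg1 asm("%o1") = a1;
	register unsigned long arg2 asm("%o2") = a2;
	register unsigned long arg3 asm("%o3") = a3;

	__asm__ __volatile__("ta	0x80"
			     : "=&r" (func), "=&r" (arg0), "=&r" (arg1),
			       "=&r" (arg2), "=&r" (arg3)
			     : "0" (func), "1" (arg0), "2" (arg1),
			       "3" (arg2), "4" (arg3)
			     : "memory");
	return arg0;
}

With such a helper the call above would read roughly:
	err = sun4v_fast_trap4(HV_FAST_MMU_MAP_PERM_ADDR, vaddr, 0UL, pte, HV_MMU_DMMU);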
+
+static unsigned long kern_large_tte(unsigned long paddr);
+
 static void __init remap_kernel(void)
 {
 	unsigned long phys_page, tte_vaddr, tte_data;
@@ -497,25 +557,34 @@ static void __init remap_kernel(void)
 
 	tte_vaddr = (unsigned long) KERNBASE;
 	phys_page = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
-	tte_data = (phys_page | (_PAGE_VALID | _PAGE_SZ4MB |
-				 _PAGE_CP | _PAGE_CV | _PAGE_P |
-				 _PAGE_L | _PAGE_W));
+	tte_data = kern_large_tte(phys_page);
 
 	kern_locked_tte_data = tte_data;
 
-	/* Now lock us into the TLBs via OBP. */
-	prom_dtlb_load(tlb_ent, tte_data, tte_vaddr);
-	prom_itlb_load(tlb_ent, tte_data, tte_vaddr);
-	if (bigkernel) {
-		tlb_ent -= 1;
-		prom_dtlb_load(tlb_ent,
-			       tte_data + 0x400000, 
-			       tte_vaddr + 0x400000);
-		prom_itlb_load(tlb_ent,
-			       tte_data + 0x400000, 
-			       tte_vaddr + 0x400000);
+	/* Now lock us into the TLBs via Hypervisor or OBP. */
+	if (tlb_type == hypervisor) {
+		hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU);
+		hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU);
+		if (bigkernel) {
+			tte_vaddr += 0x400000;
+			tte_data += 0x400000;
+			hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_DMMU);
+			hypervisor_tlb_lock(tte_vaddr, tte_data, HV_MMU_IMMU);
+		}
+	} else {
+		prom_dtlb_load(tlb_ent, tte_data, tte_vaddr);
+		prom_itlb_load(tlb_ent, tte_data, tte_vaddr);
+		if (bigkernel) {
+			tlb_ent -= 1;
+			prom_dtlb_load(tlb_ent,
+				       tte_data + 0x400000, 
+				       tte_vaddr + 0x400000);
+			prom_itlb_load(tlb_ent,
+				       tte_data + 0x400000, 
+				       tte_vaddr + 0x400000);
+		}
+		sparc64_highest_unlocked_tlb_ent = tlb_ent - 1;
 	}
-	sparc64_highest_unlocked_tlb_ent = tlb_ent - 1;
 	if (tlb_type == cheetah_plus) {
 		sparc64_kern_pri_context = (CTX_CHEETAH_PLUS_CTX0 |
 					    CTX_CHEETAH_PLUS_NUC);
@@ -533,372 +602,14 @@ static void __init inherit_prom_mappings(void)
 	prom_printf("Remapping the kernel... ");
 	remap_kernel();
 	prom_printf("done.\n");
-
-	prom_printf("Registering callbacks... ");
-	register_prom_callbacks();
-	prom_printf("done.\n");
 }
 
-/* The OBP specifications for sun4u mark 0xfffffffc00000000 and
- * upwards as reserved for use by the firmware (I wonder if this
- * will be the same on Cheetah...).  We use this virtual address
- * range for the VPTE table mappings of the nucleus so we need
- * to zap them when we enter the PROM.  -DaveM
- */
-static void __flush_nucleus_vptes(void)
-{
-	unsigned long prom_reserved_base = 0xfffffffc00000000UL;
-	int i;
-
-	/* Only DTLB must be checked for VPTE entries. */
-	if (tlb_type == spitfire) {
-		for (i = 0; i < 63; i++) {
-			unsigned long tag;
-
-			/* Spitfire Errata #32 workaround */
-			/* NOTE: Always runs on spitfire, so no cheetah+
-			 *       page size encodings.
-			 */
-			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-					     "flush	%%g6"
-					     : /* No outputs */
-					     : "r" (0),
-					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-
-			tag = spitfire_get_dtlb_tag(i);
-			if (((tag & ~(PAGE_MASK)) == 0) &&
-			    ((tag &  (PAGE_MASK)) >= prom_reserved_base)) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : /* no outputs */
-						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				spitfire_put_dtlb_data(i, 0x0UL);
-			}
-		}
-	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		for (i = 0; i < 512; i++) {
-			unsigned long tag = cheetah_get_dtlb_tag(i, 2);
-
-			if ((tag & ~PAGE_MASK) == 0 &&
-			    (tag & PAGE_MASK) >= prom_reserved_base) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : /* no outputs */
-						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				cheetah_put_dtlb_data(i, 0x0UL, 2);
-			}
-
-			if (tlb_type != cheetah_plus)
-				continue;
-
-			tag = cheetah_get_dtlb_tag(i, 3);
-
-			if ((tag & ~PAGE_MASK) == 0 &&
-			    (tag & PAGE_MASK) >= prom_reserved_base) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : /* no outputs */
-						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				cheetah_put_dtlb_data(i, 0x0UL, 3);
-			}
-		}
-	} else {
-		/* Implement me :-) */
-		BUG();
-	}
-}
-
-static int prom_ditlb_set;
-struct prom_tlb_entry {
-	int		tlb_ent;
-	unsigned long	tlb_tag;
-	unsigned long	tlb_data;
-};
-struct prom_tlb_entry prom_itlb[16], prom_dtlb[16];
-
 void prom_world(int enter)
 {
-	unsigned long pstate;
-	int i;
-
 	if (!enter)
 		set_fs((mm_segment_t) { get_thread_current_ds() });
 
-	if (!prom_ditlb_set)
-		return;
-
-	/* Make sure the following runs atomically. */
-	__asm__ __volatile__("flushw\n\t"
-			     "rdpr	%%pstate, %0\n\t"
-			     "wrpr	%0, %1, %%pstate"
-			     : "=r" (pstate)
-			     : "i" (PSTATE_IE));
-
-	if (enter) {
-		/* Kick out nucleus VPTEs. */
-		__flush_nucleus_vptes();
-
-		/* Install PROM world. */
-		for (i = 0; i < 16; i++) {
-			if (prom_dtlb[i].tlb_ent != -1) {
-				__asm__ __volatile__("stxa %0, [%1] %2\n\t"
-						     "membar #Sync"
-					: : "r" (prom_dtlb[i].tlb_tag), "r" (TLB_TAG_ACCESS),
-					"i" (ASI_DMMU));
-				if (tlb_type == spitfire)
-					spitfire_put_dtlb_data(prom_dtlb[i].tlb_ent,
-							       prom_dtlb[i].tlb_data);
-				else if (tlb_type == cheetah || tlb_type == cheetah_plus)
-					cheetah_put_ldtlb_data(prom_dtlb[i].tlb_ent,
-							       prom_dtlb[i].tlb_data);
-			}
-			if (prom_itlb[i].tlb_ent != -1) {
-				__asm__ __volatile__("stxa %0, [%1] %2\n\t"
-						     "membar #Sync"
-						     : : "r" (prom_itlb[i].tlb_tag),
-						     "r" (TLB_TAG_ACCESS),
-						     "i" (ASI_IMMU));
-				if (tlb_type == spitfire)
-					spitfire_put_itlb_data(prom_itlb[i].tlb_ent,
-							       prom_itlb[i].tlb_data);
-				else if (tlb_type == cheetah || tlb_type == cheetah_plus)
-					cheetah_put_litlb_data(prom_itlb[i].tlb_ent,
-							       prom_itlb[i].tlb_data);
-			}
-		}
-	} else {
-		for (i = 0; i < 16; i++) {
-			if (prom_dtlb[i].tlb_ent != -1) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-					: : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				if (tlb_type == spitfire)
-					spitfire_put_dtlb_data(prom_dtlb[i].tlb_ent, 0x0UL);
-				else
-					cheetah_put_ldtlb_data(prom_dtlb[i].tlb_ent, 0x0UL);
-			}
-			if (prom_itlb[i].tlb_ent != -1) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : : "r" (TLB_TAG_ACCESS),
-						     "i" (ASI_IMMU));
-				if (tlb_type == spitfire)
-					spitfire_put_itlb_data(prom_itlb[i].tlb_ent, 0x0UL);
-				else
-					cheetah_put_litlb_data(prom_itlb[i].tlb_ent, 0x0UL);
-			}
-		}
-	}
-	__asm__ __volatile__("wrpr	%0, 0, %%pstate"
-			     : : "r" (pstate));
-}
-
-void inherit_locked_prom_mappings(int save_p)
-{
-	int i;
-	int dtlb_seen = 0;
-	int itlb_seen = 0;
-
-	/* Fucking losing PROM has more mappings in the TLB, but
-	 * it (conveniently) fails to mention any of these in the
-	 * translations property.  The only ones that matter are
-	 * the locked PROM tlb entries, so we impose the following
-	 * irrecovable rule on the PROM, it is allowed 8 locked
-	 * entries in the ITLB and 8 in the DTLB.
-	 *
-	 * Supposedly the upper 16GB of the address space is
-	 * reserved for OBP, BUT I WISH THIS WAS DOCUMENTED
-	 * SOMEWHERE!!!!!!!!!!!!!!!!!  Furthermore the entire interface
-	 * used between the client program and the firmware on sun5
-	 * systems to coordinate mmu mappings is also COMPLETELY
-	 * UNDOCUMENTED!!!!!! Thanks S(t)un!
-	 */
-	if (save_p) {
-		for (i = 0; i < 16; i++) {
-			prom_itlb[i].tlb_ent = -1;
-			prom_dtlb[i].tlb_ent = -1;
-		}
-	}
-	if (tlb_type == spitfire) {
-		int high = sparc64_highest_unlocked_tlb_ent;
-		for (i = 0; i <= high; i++) {
-			unsigned long data;
-
-			/* Spitfire Errata #32 workaround */
-			/* NOTE: Always runs on spitfire, so no cheetah+
-			 *       page size encodings.
-			 */
-			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-					     "flush	%%g6"
-					     : /* No outputs */
-					     : "r" (0),
-					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-
-			data = spitfire_get_dtlb_data(i);
-			if ((data & (_PAGE_L|_PAGE_VALID)) == (_PAGE_L|_PAGE_VALID)) {
-				unsigned long tag;
-
-				/* Spitfire Errata #32 workaround */
-				/* NOTE: Always runs on spitfire, so no
-				 *       cheetah+ page size encodings.
-				 */
-				__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-						     "flush	%%g6"
-						     : /* No outputs */
-						     : "r" (0),
-						     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-
-				tag = spitfire_get_dtlb_tag(i);
-				if (save_p) {
-					prom_dtlb[dtlb_seen].tlb_ent = i;
-					prom_dtlb[dtlb_seen].tlb_tag = tag;
-					prom_dtlb[dtlb_seen].tlb_data = data;
-				}
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				spitfire_put_dtlb_data(i, 0x0UL);
-
-				dtlb_seen++;
-				if (dtlb_seen > 15)
-					break;
-			}
-		}
-
-		for (i = 0; i < high; i++) {
-			unsigned long data;
-
-			/* Spitfire Errata #32 workaround */
-			/* NOTE: Always runs on spitfire, so no
-			 *       cheetah+ page size encodings.
-			 */
-			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-					     "flush	%%g6"
-					     : /* No outputs */
-					     : "r" (0),
-					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-
-			data = spitfire_get_itlb_data(i);
-			if ((data & (_PAGE_L|_PAGE_VALID)) == (_PAGE_L|_PAGE_VALID)) {
-				unsigned long tag;
-
-				/* Spitfire Errata #32 workaround */
-				/* NOTE: Always runs on spitfire, so no
-				 *       cheetah+ page size encodings.
-				 */
-				__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-						     "flush	%%g6"
-						     : /* No outputs */
-						     : "r" (0),
-						     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-
-				tag = spitfire_get_itlb_tag(i);
-				if (save_p) {
-					prom_itlb[itlb_seen].tlb_ent = i;
-					prom_itlb[itlb_seen].tlb_tag = tag;
-					prom_itlb[itlb_seen].tlb_data = data;
-				}
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU));
-				spitfire_put_itlb_data(i, 0x0UL);
-
-				itlb_seen++;
-				if (itlb_seen > 15)
-					break;
-			}
-		}
-	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		int high = sparc64_highest_unlocked_tlb_ent;
-
-		for (i = 0; i <= high; i++) {
-			unsigned long data;
-
-			data = cheetah_get_ldtlb_data(i);
-			if ((data & (_PAGE_L|_PAGE_VALID)) == (_PAGE_L|_PAGE_VALID)) {
-				unsigned long tag;
-
-				tag = cheetah_get_ldtlb_tag(i);
-				if (save_p) {
-					prom_dtlb[dtlb_seen].tlb_ent = i;
-					prom_dtlb[dtlb_seen].tlb_tag = tag;
-					prom_dtlb[dtlb_seen].tlb_data = data;
-				}
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				cheetah_put_ldtlb_data(i, 0x0UL);
-
-				dtlb_seen++;
-				if (dtlb_seen > 15)
-					break;
-			}
-		}
-
-		for (i = 0; i < high; i++) {
-			unsigned long data;
-
-			data = cheetah_get_litlb_data(i);
-			if ((data & (_PAGE_L|_PAGE_VALID)) == (_PAGE_L|_PAGE_VALID)) {
-				unsigned long tag;
-
-				tag = cheetah_get_litlb_tag(i);
-				if (save_p) {
-					prom_itlb[itlb_seen].tlb_ent = i;
-					prom_itlb[itlb_seen].tlb_tag = tag;
-					prom_itlb[itlb_seen].tlb_data = data;
-				}
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU));
-				cheetah_put_litlb_data(i, 0x0UL);
-
-				itlb_seen++;
-				if (itlb_seen > 15)
-					break;
-			}
-		}
-	} else {
-		/* Implement me :-) */
-		BUG();
-	}
-	if (save_p)
-		prom_ditlb_set = 1;
-}
-
-/* Give PROM back his world, done during reboots... */
-void prom_reload_locked(void)
-{
-	int i;
-
-	for (i = 0; i < 16; i++) {
-		if (prom_dtlb[i].tlb_ent != -1) {
-			__asm__ __volatile__("stxa %0, [%1] %2\n\t"
-					     "membar #Sync"
-				: : "r" (prom_dtlb[i].tlb_tag), "r" (TLB_TAG_ACCESS),
-				"i" (ASI_DMMU));
-			if (tlb_type == spitfire)
-				spitfire_put_dtlb_data(prom_dtlb[i].tlb_ent,
-						       prom_dtlb[i].tlb_data);
-			else if (tlb_type == cheetah || tlb_type == cheetah_plus)
-				cheetah_put_ldtlb_data(prom_dtlb[i].tlb_ent,
-						      prom_dtlb[i].tlb_data);
-		}
-
-		if (prom_itlb[i].tlb_ent != -1) {
-			__asm__ __volatile__("stxa %0, [%1] %2\n\t"
-					     "membar #Sync"
-					     : : "r" (prom_itlb[i].tlb_tag),
-					     "r" (TLB_TAG_ACCESS),
-					     "i" (ASI_IMMU));
-			if (tlb_type == spitfire)
-				spitfire_put_itlb_data(prom_itlb[i].tlb_ent,
-						       prom_itlb[i].tlb_data);
-			else
-				cheetah_put_litlb_data(prom_itlb[i].tlb_ent,
-						       prom_itlb[i].tlb_data);
-		}
-	}
+	__asm__ __volatile__("flushw");
 }
 
 #ifdef DCACHE_ALIASING_POSSIBLE
@@ -914,7 +625,7 @@ void __flush_dcache_range(unsigned long start, unsigned long end)
 			if (++n >= 512)
 				break;
 		}
-	} else {
+	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
 		start = __pa(start);
 		end = __pa(end);
 		for (va = start; va < end; va += 32)
@@ -927,63 +638,6 @@ void __flush_dcache_range(unsigned long start, unsigned long end)
 }
 #endif /* DCACHE_ALIASING_POSSIBLE */
 
-/* If not locked, zap it. */
-void __flush_tlb_all(void)
-{
-	unsigned long pstate;
-	int i;
-
-	__asm__ __volatile__("flushw\n\t"
-			     "rdpr	%%pstate, %0\n\t"
-			     "wrpr	%0, %1, %%pstate"
-			     : "=r" (pstate)
-			     : "i" (PSTATE_IE));
-	if (tlb_type == spitfire) {
-		for (i = 0; i < 64; i++) {
-			/* Spitfire Errata #32 workaround */
-			/* NOTE: Always runs on spitfire, so no
-			 *       cheetah+ page size encodings.
-			 */
-			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-					     "flush	%%g6"
-					     : /* No outputs */
-					     : "r" (0),
-					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-
-			if (!(spitfire_get_dtlb_data(i) & _PAGE_L)) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : /* no outputs */
-						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
-				spitfire_put_dtlb_data(i, 0x0UL);
-			}
-
-			/* Spitfire Errata #32 workaround */
-			/* NOTE: Always runs on spitfire, so no
-			 *       cheetah+ page size encodings.
-			 */
-			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
-					     "flush	%%g6"
-					     : /* No outputs */
-					     : "r" (0),
-					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-
-			if (!(spitfire_get_itlb_data(i) & _PAGE_L)) {
-				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
-						     "membar #Sync"
-						     : /* no outputs */
-						     : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU));
-				spitfire_put_itlb_data(i, 0x0UL);
-			}
-		}
-	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
-		cheetah_flush_dtlb_all();
-		cheetah_flush_itlb_all();
-	}
-	__asm__ __volatile__("wrpr	%0, 0, %%pstate"
-			     : : "r" (pstate));
-}
-
 /* Caller does TLB context flushing on local CPU if necessary.
  * The caller also ensures that CTX_VALID(mm->context) is false.
  *
@@ -991,17 +645,21 @@ void __flush_tlb_all(void)
  * let the user have CTX 0 (nucleus) or we ever use a CTX
  * version of zero (and thus NO_CONTEXT would not be caught
  * by version mis-match tests in mmu_context.h).
+ *
+ * Always invoked with interrupts disabled.
  */
 void get_new_mmu_context(struct mm_struct *mm)
 {
 	unsigned long ctx, new_ctx;
 	unsigned long orig_pgsz_bits;
-	
+	unsigned long flags;
+	int new_version;
 
-	spin_lock(&ctx_alloc_lock);
+	spin_lock_irqsave(&ctx_alloc_lock, flags);
 	orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK);
 	ctx = (tlb_context_cache + 1) & CTX_NR_MASK;
 	new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx);
+	new_version = 0;
 	if (new_ctx >= (1 << CTX_NR_BITS)) {
 		new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1);
 		if (new_ctx >= ctx) {
@@ -1024,6 +682,7 @@ void get_new_mmu_context(struct mm_struct *mm)
 				mmu_context_bmap[i + 2] = 0;
 				mmu_context_bmap[i + 3] = 0;
 			}
+			new_version = 1;
 			goto out;
 		}
 	}
@@ -1032,79 +691,10 @@ void get_new_mmu_context(struct mm_struct *mm)
 out:
 	tlb_context_cache = new_ctx;
 	mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits;
-	spin_unlock(&ctx_alloc_lock);
-}
-
-#ifndef CONFIG_SMP
-struct pgtable_cache_struct pgt_quicklists;
-#endif
+	spin_unlock_irqrestore(&ctx_alloc_lock, flags);
 
-/* OK, we have to color these pages. The page tables are accessed
- * by non-Dcache enabled mapping in the VPTE area by the dtlb_backend.S
- * code, as well as by PAGE_OFFSET range direct-mapped addresses by 
- * other parts of the kernel. By coloring, we make sure that the tlbmiss 
- * fast handlers do not get data from old/garbage dcache lines that 
- * correspond to an old/stale virtual address (user/kernel) that 
- * previously mapped the pagetable page while accessing vpte range 
- * addresses. The idea is that if the vpte color and PAGE_OFFSET range 
- * color is the same, then when the kernel initializes the pagetable 
- * using the later address range, accesses with the first address
- * range will see the newly initialized data rather than the garbage.
- */
-#ifdef DCACHE_ALIASING_POSSIBLE
-#define DC_ALIAS_SHIFT	1
-#else
-#define DC_ALIAS_SHIFT	0
-#endif
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
-	struct page *page;
-	unsigned long color;
-
-	{
-		pte_t *ptep = pte_alloc_one_fast(mm, address);
-
-		if (ptep)
-			return ptep;
-	}
-
-	color = VPTE_COLOR(address);
-	page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, DC_ALIAS_SHIFT);
-	if (page) {
-		unsigned long *to_free;
-		unsigned long paddr;
-		pte_t *pte;
-
-#ifdef DCACHE_ALIASING_POSSIBLE
-		set_page_count(page, 1);
-		ClearPageCompound(page);
-
-		set_page_count((page + 1), 1);
-		ClearPageCompound(page + 1);
-#endif
-		paddr = (unsigned long) page_address(page);
-		memset((char *)paddr, 0, (PAGE_SIZE << DC_ALIAS_SHIFT));
-
-		if (!color) {
-			pte = (pte_t *) paddr;
-			to_free = (unsigned long *) (paddr + PAGE_SIZE);
-		} else {
-			pte = (pte_t *) (paddr + PAGE_SIZE);
-			to_free = (unsigned long *) paddr;
-		}
-
-#ifdef DCACHE_ALIASING_POSSIBLE
-		/* Now free the other one up, adjust cache size. */
-		preempt_disable();
-		*to_free = (unsigned long) pte_quicklist[color ^ 0x1];
-		pte_quicklist[color ^ 0x1] = to_free;
-		pgtable_cache_size++;
-		preempt_enable();
-#endif
-
-		return pte;
-	}
-	return NULL;
+	if (unlikely(new_version))
+		smp_new_mmu_context_version();
 }
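The allocator above keeps a generation ("version") in the upper bits of tlb_context_cache and a context number in the low CTX_NR_BITS; when the number space is exhausted it recycles the bitmap under a new version and then asks the other CPUs to pick up that version. A simplified, self-contained model of the scheme follows; the 10-bit number field is an assumption for the example, the kernel's field widths come from the CTX_* definitions in mmu_context.h.

#include <string.h>

#define NR_BITS		10
#define NR_MASK		((1UL << NR_BITS) - 1UL)
#define VERSION_INC	(1UL << NR_BITS)

static unsigned long ctx_cache = VERSION_INC;			/* version 1, number 0 */
static unsigned long ctx_bitmap[(1UL << NR_BITS) / 64] = { 1UL };	/* ctx 0 reserved */

static unsigned long alloc_context(void)
{
	unsigned long nr = (ctx_cache + 1) & NR_MASK;

	/* A linear probe stands in for find_next_zero_bit(). */
	while (ctx_bitmap[nr / 64] & (1UL << (nr % 64))) {
		nr = (nr + 1) & NR_MASK;
		if (nr == 0) {
			/* Numbers exhausted: recycle them under a new
			 * version, the point at which the code above sets
			 * new_version and later calls
			 * smp_new_mmu_context_version().
			 */
			memset(ctx_bitmap, 0, sizeof(ctx_bitmap));
			ctx_bitmap[0] = 1UL;	/* keep nucleus ctx 0 reserved */
			ctx_cache += VERSION_INC;
			nr = 1;
			break;
		}
	}
	ctx_bitmap[nr / 64] |= 1UL << (nr % 64);
	ctx_cache = (ctx_cache & ~NR_MASK) | nr;
	return ctx_cache;
}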
 
 void sparc_ultra_dump_itlb(void)
@@ -1196,9 +786,78 @@ void sparc_ultra_dump_dtlb(void)
 
 extern unsigned long cmdline_memory_size;
 
-unsigned long __init bootmem_init(unsigned long *pages_avail)
+/* Find a free area for the bootmem map, avoiding the kernel image
+ * and the initial ramdisk.
+ */
+static unsigned long __init choose_bootmap_pfn(unsigned long start_pfn,
+					       unsigned long end_pfn)
 {
-	unsigned long bootmap_size, start_pfn, end_pfn;
+	unsigned long avoid_start, avoid_end, bootmap_size;
+	int i;
+
+	bootmap_size = ((end_pfn - start_pfn) + 7) / 8;
+	bootmap_size = ALIGN(bootmap_size, sizeof(long));
+
+	avoid_start = avoid_end = 0;
+#ifdef CONFIG_BLK_DEV_INITRD
+	avoid_start = initrd_start;
+	avoid_end = PAGE_ALIGN(initrd_end);
+#endif
+
+#ifdef CONFIG_DEBUG_BOOTMEM
+	prom_printf("choose_bootmap_pfn: kern[%lx:%lx] avoid[%lx:%lx]\n",
+		    kern_base, PAGE_ALIGN(kern_base + kern_size),
+		    avoid_start, avoid_end);
+#endif
+	for (i = 0; i < pavail_ents; i++) {
+		unsigned long start, end;
+
+		start = pavail[i].phys_addr;
+		end = start + pavail[i].reg_size;
+
+		while (start < end) {
+			if (start >= kern_base &&
+			    start < PAGE_ALIGN(kern_base + kern_size)) {
+				start = PAGE_ALIGN(kern_base + kern_size);
+				continue;
+			}
+			if (start >= avoid_start && start < avoid_end) {
+				start = avoid_end;
+				continue;
+			}
+
+			if ((end - start) < bootmap_size)
+				break;
+
+			if (start < kern_base &&
+			    (start + bootmap_size) > kern_base) {
+				start = PAGE_ALIGN(kern_base + kern_size);
+				continue;
+			}
+
+			if (start < avoid_start &&
+			    (start + bootmap_size) > avoid_start) {
+				start = avoid_end;
+				continue;
+			}
+
+			/* OK, it doesn't overlap anything, use it.  */
+#ifdef CONFIG_DEBUG_BOOTMEM
+			prom_printf("choose_bootmap_pfn: Using %lx [%lx]\n",
+				    start >> PAGE_SHIFT, start);
+#endif
+			return start >> PAGE_SHIFT;
+		}
+	}
+
+	prom_printf("Cannot find free area for bootmap, aborting.\n");
+	prom_halt();
+}
+
+static unsigned long __init bootmem_init(unsigned long *pages_avail,
+					 unsigned long phys_base)
+{
+	unsigned long bootmap_size, end_pfn;
 	unsigned long end_of_phys_memory = 0UL;
 	unsigned long bootmap_pfn, bytes_avail, size;
 	int i;
@@ -1236,14 +895,6 @@ unsigned long __init bootmem_init(unsigned long *pages_avail)
 
 	*pages_avail = bytes_avail >> PAGE_SHIFT;
 
-	/* Start with page aligned address of last symbol in kernel
-	 * image.  The kernel is hard mapped below PAGE_OFFSET in a
-	 * 4MB locked TLB translation.
-	 */
-	start_pfn = PAGE_ALIGN(kern_base + kern_size) >> PAGE_SHIFT;
-
-	bootmap_pfn = start_pfn;
-
 	end_pfn = end_of_phys_memory >> PAGE_SHIFT;
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -1260,23 +911,22 @@ unsigned long __init bootmem_init(unsigned long *pages_avail)
 		                 	 "(0x%016lx > 0x%016lx)\ndisabling initrd\n",
 			       initrd_end, end_of_phys_memory);
 			initrd_start = 0;
-		}
-		if (initrd_start) {
-			if (initrd_start >= (start_pfn << PAGE_SHIFT) &&
-			    initrd_start < (start_pfn << PAGE_SHIFT) + 2 * PAGE_SIZE)
-				bootmap_pfn = PAGE_ALIGN (initrd_end) >> PAGE_SHIFT;
+			initrd_end = 0;
 		}
 	}
 #endif	
 	/* Initialize the boot-time allocator. */
 	max_pfn = max_low_pfn = end_pfn;
-	min_low_pfn = pfn_base;
+	min_low_pfn = (phys_base >> PAGE_SHIFT);
+
+	bootmap_pfn = choose_bootmap_pfn(min_low_pfn, end_pfn);
 
 #ifdef CONFIG_DEBUG_BOOTMEM
 	prom_printf("init_bootmem(min[%lx], bootmap[%lx], max[%lx])\n",
 		    min_low_pfn, bootmap_pfn, max_low_pfn);
 #endif
-	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn, pfn_base, end_pfn);
+	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn,
+					 min_low_pfn, end_pfn);
 
 	/* Now register the available physical memory with the
 	 * allocator.
@@ -1324,9 +974,26 @@ unsigned long __init bootmem_init(unsigned long *pages_avail)
 	reserve_bootmem((bootmap_pfn << PAGE_SHIFT), size);
 	*pages_avail -= PAGE_ALIGN(size) >> PAGE_SHIFT;
 
+	for (i = 0; i < pavail_ents; i++) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = pavail[i].phys_addr >> PAGE_SHIFT;
+		end_pfn = (start_pfn + (pavail[i].reg_size >> PAGE_SHIFT));
+#ifdef CONFIG_DEBUG_BOOTMEM
+		prom_printf("memory_present(0, %lx, %lx)\n",
+			    start_pfn, end_pfn);
+#endif
+		memory_present(0, start_pfn, end_pfn);
+	}
+
+	sparse_init();
+
 	return end_pfn;
 }
 
+static struct linux_prom64_registers pall[MAX_BANKS] __initdata;
+static int pall_ents __initdata;
+
 #ifdef CONFIG_DEBUG_PAGEALLOC
 static unsigned long kernel_map_range(unsigned long pstart, unsigned long pend, pgprot_t prot)
 {
@@ -1382,14 +1049,44 @@ static unsigned long kernel_map_range(unsigned long pstart, unsigned long pend,
 	return alloc_bytes;
 }
 
-static struct linux_prom64_registers pall[MAX_BANKS] __initdata;
-static int pall_ents __initdata;
-
 extern unsigned int kvmap_linear_patch[1];
+#endif /* CONFIG_DEBUG_PAGEALLOC */
+
+static void __init mark_kpte_bitmap(unsigned long start, unsigned long end)
+{
+	const unsigned long shift_256MB = 28;
+	const unsigned long mask_256MB = ((1UL << shift_256MB) - 1UL);
+	const unsigned long size_256MB = (1UL << shift_256MB);
+
+	while (start < end) {
+		long remains;
+
+		remains = end - start;
+		if (remains < size_256MB)
+			break;
+
+		if (start & mask_256MB) {
+			start = (start + size_256MB) & ~mask_256MB;
+			continue;
+		}
+
+		while (remains >= size_256MB) {
+			unsigned long index = start >> shift_256MB;
+
+			__set_bit(index, kpte_linear_bitmap);
+
+			start += size_256MB;
+			remains -= size_256MB;
+		}
+	}
+}
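mark_kpte_bitmap() only records fully aligned 256MB chunks; a partial leading or trailing piece of a physical bank is skipped, so those addresses keep using the smaller linear-mapping size. A small standalone example of which bits get set for a hypothetical bank:

/* Hypothetical physical bank from 64MB to 768MB: only the aligned
 * chunks [256MB,512MB) and [512MB,768MB) are marked, i.e. bits 1 and 2.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long shift = 28;			/* 256MB */
	const unsigned long size  = 1UL << shift;
	const unsigned long mask  = size - 1UL;
	unsigned long start = 0x04000000UL;		/* 64MB */
	unsigned long end   = 0x30000000UL;		/* 768MB */

	if (start & mask)				/* round up, as the loop does */
		start = (start + size) & ~mask;

	while (end - start >= size) {
		printf("set bit %lu for [%#lx, %#lx)\n",
		       start >> shift, start, start + size);
		start += size;
	}
	return 0;
}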
 
 static void __init kernel_physical_mapping_init(void)
 {
-	unsigned long i, mem_alloced = 0UL;
+	unsigned long i;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	unsigned long mem_alloced = 0UL;
+#endif
 
 	read_obp_memory("reg", &pall[0], &pall_ents);
 
@@ -1398,10 +1095,16 @@ static void __init kernel_physical_mapping_init(void)
 
 		phys_start = pall[i].phys_addr;
 		phys_end = phys_start + pall[i].reg_size;
+
+		mark_kpte_bitmap(phys_start, phys_end);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
 		mem_alloced += kernel_map_range(phys_start, phys_end,
 						PAGE_KERNEL);
+#endif
 	}
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
 	printk("Allocated %ld bytes for kernel page tables.\n",
 	       mem_alloced);
 
@@ -1409,8 +1112,10 @@ static void __init kernel_physical_mapping_init(void)
 	flushi(&kvmap_linear_patch[0]);
 
 	__flush_tlb_all();
+#endif
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
 void kernel_map_pages(struct page *page, int numpages, int enable)
 {
 	unsigned long phys_start = page_to_pfn(page) << PAGE_SHIFT;
@@ -1419,6 +1124,9 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
 	kernel_map_range(phys_start, phys_end,
 			 (enable ? PAGE_KERNEL : __pgprot(0)));
 
+	flush_tsb_kernel_range(PAGE_OFFSET + phys_start,
+			       PAGE_OFFSET + phys_end);
+
 	/* we should perform an IPI and flush all tlbs,
 	 * but that can deadlock->flush only current cpu.
 	 */
@@ -1439,18 +1147,150 @@ unsigned long __init find_ecache_flush_span(unsigned long size)
 	return ~0UL;
 }
 
+static void __init tsb_phys_patch(void)
+{
+	struct tsb_ldquad_phys_patch_entry *pquad;
+	struct tsb_phys_patch_entry *p;
+
+	pquad = &__tsb_ldquad_phys_patch;
+	while (pquad < &__tsb_ldquad_phys_patch_end) {
+		unsigned long addr = pquad->addr;
+
+		if (tlb_type == hypervisor)
+			*(unsigned int *) addr = pquad->sun4v_insn;
+		else
+			*(unsigned int *) addr = pquad->sun4u_insn;
+		wmb();
+		__asm__ __volatile__("flush	%0"
+				     : /* no outputs */
+				     : "r" (addr));
+
+		pquad++;
+	}
+
+	p = &__tsb_phys_patch;
+	while (p < &__tsb_phys_patch_end) {
+		unsigned long addr = p->addr;
+
+		*(unsigned int *) addr = p->insn;
+		wmb();
+		__asm__ __volatile__("flush	%0"
+				     : /* no outputs */
+				     : "r" (addr));
+
+		p++;
+	}
+}
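The tables walked by tsb_phys_patch() are arrays of small records emitted at the assembly patch sites. Judging only from the fields used above, each quad-load entry pairs the address of the instruction to rewrite with a sun4u and a sun4v replacement, and the plain entries carry a single replacement instruction. A sketch of that shape, under those assumptions (the authoritative layouts live in asm/tsb.h and may differ in detail):

/* Sketch inferred from the loops above; field types are assumptions. */
struct example_ldquad_patch_entry {
	unsigned int	addr;		/* instruction to rewrite */
	unsigned int	sun4u_insn;	/* replacement on sun4u */
	unsigned int	sun4v_insn;	/* replacement on sun4v */
};

struct example_phys_patch_entry {
	unsigned int	addr;
	unsigned int	insn;
};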
+
+/* Don't mark as init, we give this to the Hypervisor.  */
+static struct hv_tsb_descr ktsb_descr[2];
+extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
+
+static void __init sun4v_ktsb_init(void)
+{
+	unsigned long ktsb_pa;
+
+	/* First KTSB for PAGE_SIZE mappings.  */
+	ktsb_pa = kern_base + ((unsigned long)&swapper_tsb[0] - KERNBASE);
+
+	switch (PAGE_SIZE) {
+	case 8 * 1024:
+	default:
+		ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_8K;
+		ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_8K;
+		break;
+
+	case 64 * 1024:
+		ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_64K;
+		ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_64K;
+		break;
+
+	case 512 * 1024:
+		ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_512K;
+		ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_512K;
+		break;
+
+	case 4 * 1024 * 1024:
+		ktsb_descr[0].pgsz_idx = HV_PGSZ_IDX_4MB;
+		ktsb_descr[0].pgsz_mask = HV_PGSZ_MASK_4MB;
+		break;
+	};
+
+	ktsb_descr[0].assoc = 1;
+	ktsb_descr[0].num_ttes = KERNEL_TSB_NENTRIES;
+	ktsb_descr[0].ctx_idx = 0;
+	ktsb_descr[0].tsb_base = ktsb_pa;
+	ktsb_descr[0].resv = 0;
+
+	/* Second KTSB for 4MB/256MB mappings.  */
+	ktsb_pa = (kern_base +
+		   ((unsigned long)&swapper_4m_tsb[0] - KERNBASE));
+
+	ktsb_descr[1].pgsz_idx = HV_PGSZ_IDX_4MB;
+	ktsb_descr[1].pgsz_mask = (HV_PGSZ_MASK_4MB |
+				   HV_PGSZ_MASK_256MB);
+	ktsb_descr[1].assoc = 1;
+	ktsb_descr[1].num_ttes = KERNEL_TSB4M_NENTRIES;
+	ktsb_descr[1].ctx_idx = 0;
+	ktsb_descr[1].tsb_base = ktsb_pa;
+	ktsb_descr[1].resv = 0;
+}
+
+void __cpuinit sun4v_ktsb_register(void)
+{
+	register unsigned long func asm("%o5");
+	register unsigned long arg0 asm("%o0");
+	register unsigned long arg1 asm("%o1");
+	unsigned long pa;
+
+	pa = kern_base + ((unsigned long)&ktsb_descr[0] - KERNBASE);
+
+	func = HV_FAST_MMU_TSB_CTX0;
+	arg0 = 2;
+	arg1 = pa;
+	__asm__ __volatile__("ta	%6"
+			     : "=&r" (func), "=&r" (arg0), "=&r" (arg1)
+			     : "0" (func), "1" (arg0), "2" (arg1),
+			       "i" (HV_FAST_TRAP));
+}
+
 /* paging_init() sets up the page tables */
 
 extern void cheetah_ecache_flush_init(void);
+extern void sun4v_patch_tlb_handlers(void);
 
 static unsigned long last_valid_pfn;
 pgd_t swapper_pg_dir[2048];
 
+static void sun4u_pgprot_init(void);
+static void sun4v_pgprot_init(void);
+
 void __init paging_init(void)
 {
-	unsigned long end_pfn, pages_avail, shift;
+	unsigned long end_pfn, pages_avail, shift, phys_base;
 	unsigned long real_end, i;
 
+	kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
+	kern_size = (unsigned long)&_end - (unsigned long)KERNBASE;
+
+	/* Invalidate both kernel TSBs.  */
+	memset(swapper_tsb, 0x40, sizeof(swapper_tsb));
+	memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb));
+
+	if (tlb_type == hypervisor)
+		sun4v_pgprot_init();
+	else
+		sun4u_pgprot_init();
+
+	if (tlb_type == cheetah_plus ||
+	    tlb_type == hypervisor)
+		tsb_phys_patch();
+
+	if (tlb_type == hypervisor) {
+		sun4v_patch_tlb_handlers();
+		sun4v_ktsb_init();
+	}
+
 	/* Find available physical memory... */
 	read_obp_memory("available", &pavail[0], &pavail_ents);
 
@@ -1458,11 +1298,6 @@ void __init paging_init(void)
 	for (i = 0; i < pavail_ents; i++)
 		phys_base = min(phys_base, pavail[i].phys_addr);
 
-	pfn_base = phys_base >> PAGE_SHIFT;
-
-	kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
-	kern_size = (unsigned long)&_end - (unsigned long)KERNBASE;
-
 	set_bit(0, mmu_context_bmap);
 
 	shift = kern_base + PAGE_OFFSET - ((unsigned long)KERNBASE);
@@ -1486,47 +1321,38 @@ void __init paging_init(void)
 	pud_set(pud_offset(&swapper_pg_dir[0], 0),
 		swapper_low_pmd_dir + (shift / sizeof(pgd_t)));
 	
-	swapper_pgd_zero = pgd_val(swapper_pg_dir[0]);
-	
 	inherit_prom_mappings();
 	
-	/* Ok, we can use our TLB miss and window trap handlers safely.
-	 * We need to do a quick peek here to see if we are on StarFire
-	 * or not, so setup_tba can setup the IRQ globals correctly (it
-	 * needs to get the hard smp processor id correctly).
-	 */
-	{
-		extern void setup_tba(int);
-		setup_tba(this_is_starfire);
-	}
-
-	inherit_locked_prom_mappings(1);
+	/* Ok, we can use our TLB miss and window trap handlers safely.  */
+	setup_tba();
 
 	__flush_tlb_all();
 
+	if (tlb_type == hypervisor)
+		sun4v_ktsb_register();
+
 	/* Setup bootmem... */
 	pages_avail = 0;
-	last_valid_pfn = end_pfn = bootmem_init(&pages_avail);
+	last_valid_pfn = end_pfn = bootmem_init(&pages_avail, phys_base);
+
+	max_mapnr = last_valid_pfn;
 
-#ifdef CONFIG_DEBUG_PAGEALLOC
 	kernel_physical_mapping_init();
-#endif
 
 	{
 		unsigned long zones_size[MAX_NR_ZONES];
 		unsigned long zholes_size[MAX_NR_ZONES];
-		unsigned long npages;
 		int znum;
 
 		for (znum = 0; znum < MAX_NR_ZONES; znum++)
 			zones_size[znum] = zholes_size[znum] = 0;
 
-		npages = end_pfn - pfn_base;
-		zones_size[ZONE_DMA] = npages;
-		zholes_size[ZONE_DMA] = npages - pages_avail;
+		zones_size[ZONE_DMA] = end_pfn;
+		zholes_size[ZONE_DMA] = end_pfn - pages_avail;
 
 		free_area_init_node(0, &contig_page_data, zones_size,
-				    phys_base >> PAGE_SHIFT, zholes_size);
+				    __pa(PAGE_OFFSET) >> PAGE_SHIFT,
+				    zholes_size);
 	}
 
 	device_scan();
@@ -1596,7 +1422,6 @@ void __init mem_init(void)
 
 	taint_real_pages();
 
-	max_mapnr = last_valid_pfn - pfn_base;
 	high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
 #ifdef CONFIG_DEBUG_BOOTMEM
@@ -1653,7 +1478,7 @@ void free_initmem(void)
 		p = virt_to_page(page);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		num_physpages++;
 		totalram_pages++;
@@ -1669,10 +1494,349 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		struct page *p = virt_to_page(start);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		num_physpages++;
 		totalram_pages++;
 	}
 }
 #endif
+
+#define _PAGE_CACHE_4U	(_PAGE_CP_4U | _PAGE_CV_4U)
+#define _PAGE_CACHE_4V	(_PAGE_CP_4V | _PAGE_CV_4V)
+#define __DIRTY_BITS_4U	 (_PAGE_MODIFIED_4U | _PAGE_WRITE_4U | _PAGE_W_4U)
+#define __DIRTY_BITS_4V	 (_PAGE_MODIFIED_4V | _PAGE_WRITE_4V | _PAGE_W_4V)
+#define __ACCESS_BITS_4U (_PAGE_ACCESSED_4U | _PAGE_READ_4U | _PAGE_R)
+#define __ACCESS_BITS_4V (_PAGE_ACCESSED_4V | _PAGE_READ_4V | _PAGE_R)
+
+pgprot_t PAGE_KERNEL __read_mostly;
+EXPORT_SYMBOL(PAGE_KERNEL);
+
+pgprot_t PAGE_KERNEL_LOCKED __read_mostly;
+pgprot_t PAGE_COPY __read_mostly;
+
+pgprot_t PAGE_SHARED __read_mostly;
+EXPORT_SYMBOL(PAGE_SHARED);
+
+pgprot_t PAGE_EXEC __read_mostly;
+unsigned long pg_iobits __read_mostly;
+
+unsigned long _PAGE_IE __read_mostly;
+
+unsigned long _PAGE_E __read_mostly;
+EXPORT_SYMBOL(_PAGE_E);
+
+unsigned long _PAGE_CACHE __read_mostly;
+EXPORT_SYMBOL(_PAGE_CACHE);
+
+static void prot_init_common(unsigned long page_none,
+			     unsigned long page_shared,
+			     unsigned long page_copy,
+			     unsigned long page_readonly,
+			     unsigned long page_exec_bit)
+{
+	PAGE_COPY = __pgprot(page_copy);
+	PAGE_SHARED = __pgprot(page_shared);
+
+	protection_map[0x0] = __pgprot(page_none);
+	protection_map[0x1] = __pgprot(page_readonly & ~page_exec_bit);
+	protection_map[0x2] = __pgprot(page_copy & ~page_exec_bit);
+	protection_map[0x3] = __pgprot(page_copy & ~page_exec_bit);
+	protection_map[0x4] = __pgprot(page_readonly);
+	protection_map[0x5] = __pgprot(page_readonly);
+	protection_map[0x6] = __pgprot(page_copy);
+	protection_map[0x7] = __pgprot(page_copy);
+	protection_map[0x8] = __pgprot(page_none);
+	protection_map[0x9] = __pgprot(page_readonly & ~page_exec_bit);
+	protection_map[0xa] = __pgprot(page_shared & ~page_exec_bit);
+	protection_map[0xb] = __pgprot(page_shared & ~page_exec_bit);
+	protection_map[0xc] = __pgprot(page_readonly);
+	protection_map[0xd] = __pgprot(page_readonly);
+	protection_map[0xe] = __pgprot(page_shared);
+	protection_map[0xf] = __pgprot(page_shared);
+}
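prot_init_common() fills protection_map[] indexed by the low four VM flag bits. Assuming the usual encoding (VM_READ = 0x1, VM_WRITE = 0x2, VM_EXEC = 0x4, VM_SHARED = 0x8), a private read/write mapping (index 0x3) gets the copy-on-write protection with the exec bit stripped, while the shared equivalent (index 0xb) gets page_shared without exec. A tiny sketch of the lookup:

/* Illustrative lookup, assuming the conventional VM_* bit layout. */
static pgprot_t prot_for(unsigned long vm_flags)
{
	return protection_map[vm_flags & 0xfUL];
}

/* prot_for(VM_READ | VM_WRITE)             -> page_copy  without exec
 * prot_for(VM_READ | VM_WRITE | VM_SHARED) -> page_shared without exec
 */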
+
+static void __init sun4u_pgprot_init(void)
+{
+	unsigned long page_none, page_shared, page_copy, page_readonly;
+	unsigned long page_exec_bit;
+
+	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID |
+				_PAGE_CACHE_4U | _PAGE_P_4U |
+				__ACCESS_BITS_4U | __DIRTY_BITS_4U |
+				_PAGE_EXEC_4U);
+	PAGE_KERNEL_LOCKED = __pgprot (_PAGE_PRESENT_4U | _PAGE_VALID |
+				       _PAGE_CACHE_4U | _PAGE_P_4U |
+				       __ACCESS_BITS_4U | __DIRTY_BITS_4U |
+				       _PAGE_EXEC_4U | _PAGE_L_4U);
+	PAGE_EXEC = __pgprot(_PAGE_EXEC_4U);
+
+	_PAGE_IE = _PAGE_IE_4U;
+	_PAGE_E = _PAGE_E_4U;
+	_PAGE_CACHE = _PAGE_CACHE_4U;
+
+	pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4U | __DIRTY_BITS_4U |
+		     __ACCESS_BITS_4U | _PAGE_E_4U);
+
+	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^
+		0xfffff80000000000;
+	kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U |
+				   _PAGE_P_4U | _PAGE_W_4U);
+
+	/* XXX Should use 256MB on Panther. XXX */
+	kern_linear_pte_xor[1] = kern_linear_pte_xor[0];
+
+	_PAGE_SZBITS = _PAGE_SZBITS_4U;
+	_PAGE_ALL_SZ_BITS =  (_PAGE_SZ4MB_4U | _PAGE_SZ512K_4U |
+			      _PAGE_SZ64K_4U | _PAGE_SZ8K_4U |
+			      _PAGE_SZ32MB_4U | _PAGE_SZ256MB_4U);
+
+
+	page_none = _PAGE_PRESENT_4U | _PAGE_ACCESSED_4U | _PAGE_CACHE_4U;
+	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
+		       __ACCESS_BITS_4U | _PAGE_WRITE_4U | _PAGE_EXEC_4U);
+	page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
+		       __ACCESS_BITS_4U | _PAGE_EXEC_4U);
+	page_readonly   = (_PAGE_VALID | _PAGE_PRESENT_4U | _PAGE_CACHE_4U |
+			   __ACCESS_BITS_4U | _PAGE_EXEC_4U);
+
+	page_exec_bit = _PAGE_EXEC_4U;
+
+	prot_init_common(page_none, page_shared, page_copy, page_readonly,
+			 page_exec_bit);
+}
+
+static void __init sun4v_pgprot_init(void)
+{
+	unsigned long page_none, page_shared, page_copy, page_readonly;
+	unsigned long page_exec_bit;
+
+	PAGE_KERNEL = __pgprot (_PAGE_PRESENT_4V | _PAGE_VALID |
+				_PAGE_CACHE_4V | _PAGE_P_4V |
+				__ACCESS_BITS_4V | __DIRTY_BITS_4V |
+				_PAGE_EXEC_4V);
+	PAGE_KERNEL_LOCKED = PAGE_KERNEL;
+	PAGE_EXEC = __pgprot(_PAGE_EXEC_4V);
+
+	_PAGE_IE = _PAGE_IE_4V;
+	_PAGE_E = _PAGE_E_4V;
+	_PAGE_CACHE = _PAGE_CACHE_4V;
+
+	kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^
+		0xfffff80000000000;
+	kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+				   _PAGE_P_4V | _PAGE_W_4V);
+
+	kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^
+		0xfffff80000000000;
+	kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V |
+				   _PAGE_P_4V | _PAGE_W_4V);
+
+	pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4V | __DIRTY_BITS_4V |
+		     __ACCESS_BITS_4V | _PAGE_E_4V);
+
+	_PAGE_SZBITS = _PAGE_SZBITS_4V;
+	_PAGE_ALL_SZ_BITS = (_PAGE_SZ16GB_4V | _PAGE_SZ2GB_4V |
+			     _PAGE_SZ256MB_4V | _PAGE_SZ32MB_4V |
+			     _PAGE_SZ4MB_4V | _PAGE_SZ512K_4V |
+			     _PAGE_SZ64K_4V | _PAGE_SZ8K_4V);
+
+	page_none = _PAGE_PRESENT_4V | _PAGE_ACCESSED_4V | _PAGE_CACHE_4V;
+	page_shared = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+		       __ACCESS_BITS_4V | _PAGE_WRITE_4V | _PAGE_EXEC_4V);
+	page_copy   = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+		       __ACCESS_BITS_4V | _PAGE_EXEC_4V);
+	page_readonly = (_PAGE_VALID | _PAGE_PRESENT_4V | _PAGE_CACHE_4V |
+			 __ACCESS_BITS_4V | _PAGE_EXEC_4V);
+
+	page_exec_bit = _PAGE_EXEC_4V;
+
+	prot_init_common(page_none, page_shared, page_copy, page_readonly,
+			 page_exec_bit);
+}
+
+unsigned long pte_sz_bits(unsigned long sz)
+{
+	if (tlb_type == hypervisor) {
+		switch (sz) {
+		case 8 * 1024:
+		default:
+			return _PAGE_SZ8K_4V;
+		case 64 * 1024:
+			return _PAGE_SZ64K_4V;
+		case 512 * 1024:
+			return _PAGE_SZ512K_4V;
+		case 4 * 1024 * 1024:
+			return _PAGE_SZ4MB_4V;
+		};
+	} else {
+		switch (sz) {
+		case 8 * 1024:
+		default:
+			return _PAGE_SZ8K_4U;
+		case 64 * 1024:
+			return _PAGE_SZ64K_4U;
+		case 512 * 1024:
+			return _PAGE_SZ512K_4U;
+		case 4 * 1024 * 1024:
+			return _PAGE_SZ4MB_4U;
+		};
+	}
+}
+
+pte_t mk_pte_io(unsigned long page, pgprot_t prot, int space, unsigned long page_size)
+{
+	pte_t pte;
+
+	pte_val(pte)  = page | pgprot_val(pgprot_noncached(prot));
+	pte_val(pte) |= (((unsigned long)space) << 32);
+	pte_val(pte) |= pte_sz_bits(page_size);
+
+	return pte;
+}
+
+static unsigned long kern_large_tte(unsigned long paddr)
+{
+	unsigned long val;
+
+	val = (_PAGE_VALID | _PAGE_SZ4MB_4U |
+	       _PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U |
+	       _PAGE_EXEC_4U | _PAGE_L_4U | _PAGE_W_4U);
+	if (tlb_type == hypervisor)
+		val = (_PAGE_VALID | _PAGE_SZ4MB_4V |
+		       _PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V |
+		       _PAGE_EXEC_4V | _PAGE_W_4V);
+
+	return val | paddr;
+}
+
+/*
+ * Translate PROM's mapping we capture at boot time into physical address.
+ * The second parameter, when non-NULL, receives an error indication.
+ */
+unsigned long prom_virt_to_phys(unsigned long promva, int *error)
+{
+	unsigned long mask;
+	int i;
+
+	mask = _PAGE_PADDR_4U;
+	if (tlb_type == hypervisor)
+		mask = _PAGE_PADDR_4V;
+
+	for (i = 0; i < prom_trans_ents; i++) {
+		struct linux_prom_translation *p = &prom_trans[i];
+
+		if (promva >= p->virt &&
+		    promva < (p->virt + p->size)) {
+			unsigned long base = p->data & mask;
+
+			if (error)
+				*error = 0;
+			return base + (promva & (8192 - 1));
+		}
+	}
+	if (error)
+		*error = 1;
+	return 0UL;
+}
+
+/* XXX We should kill off this ugly thing at some point. XXX */
+unsigned long sun4u_get_pte(unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t *pudp;
+	pmd_t *pmdp;
+	pte_t *ptep;
+	unsigned long mask = _PAGE_PADDR_4U;
+
+	if (tlb_type == hypervisor)
+		mask = _PAGE_PADDR_4V;
+
+	if (addr >= PAGE_OFFSET)
+		return addr & mask;
+
+	if ((addr >= LOW_OBP_ADDRESS) && (addr < HI_OBP_ADDRESS))
+		return prom_virt_to_phys(addr, NULL);
+
+	pgdp = pgd_offset_k(addr);
+	pudp = pud_offset(pgdp, addr);
+	pmdp = pmd_offset(pudp, addr);
+	ptep = pte_offset_kernel(pmdp, addr);
+
+	return pte_val(*ptep) & mask;
+}
+
+/* If not locked, zap it. */
+void __flush_tlb_all(void)
+{
+	unsigned long pstate;
+	int i;
+
+	__asm__ __volatile__("flushw\n\t"
+			     "rdpr	%%pstate, %0\n\t"
+			     "wrpr	%0, %1, %%pstate"
+			     : "=r" (pstate)
+			     : "i" (PSTATE_IE));
+	if (tlb_type == spitfire) {
+		for (i = 0; i < 64; i++) {
+			/* Spitfire Errata #32 workaround */
+			/* NOTE: Always runs on spitfire, so no
+			 *       cheetah+ page size encodings.
+			 */
+			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
+					     "flush	%%g6"
+					     : /* No outputs */
+					     : "r" (0),
+					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
+
+			if (!(spitfire_get_dtlb_data(i) & _PAGE_L_4U)) {
+				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
+						     "membar #Sync"
+						     : /* no outputs */
+						     : "r" (TLB_TAG_ACCESS), "i" (ASI_DMMU));
+				spitfire_put_dtlb_data(i, 0x0UL);
+			}
+
+			/* Spitfire Errata #32 workaround */
+			/* NOTE: Always runs on spitfire, so no
+			 *       cheetah+ page size encodings.
+			 */
+			__asm__ __volatile__("stxa	%0, [%1] %2\n\t"
+					     "flush	%%g6"
+					     : /* No outputs */
+					     : "r" (0),
+					     "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
+
+			if (!(spitfire_get_itlb_data(i) & _PAGE_L_4U)) {
+				__asm__ __volatile__("stxa %%g0, [%0] %1\n\t"
+						     "membar #Sync"
+						     : /* no outputs */
+						     : "r" (TLB_TAG_ACCESS), "i" (ASI_IMMU));
+				spitfire_put_itlb_data(i, 0x0UL);
+			}
+		}
+	} else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
+		cheetah_flush_dtlb_all();
+		cheetah_flush_itlb_all();
+	}
+	__asm__ __volatile__("wrpr	%0, 0, %%pstate"
+			     : : "r" (pstate));
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	init_page_count(page);
+	__free_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+int remove_memory(u64 start, u64 size)
+{
+	return -EINVAL;
+}
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c
index 8b104be4662b..a079cf42505e 100644
--- a/arch/sparc64/mm/tlb.c
+++ b/arch/sparc64/mm/tlb.c
@@ -25,6 +25,8 @@ void flush_tlb_pending(void)
 	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
 
 	if (mp->tlb_nr) {
+		flush_tsb_user(mp);
+
 		if (CTX_VALID(mp->mm->context)) {
 #ifdef CONFIG_SMP
 			smp_flush_tlb_pending(mp->mm, mp->tlb_nr,
@@ -47,7 +49,8 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t
 	if (pte_exec(orig))
 		vaddr |= 0x1UL;
 
-	if (pte_dirty(orig)) {
+	if (tlb_type != hypervisor &&
+	    pte_dirty(orig)) {
 		unsigned long paddr, pfn = pte_pfn(orig);
 		struct address_space *mapping;
 		struct page *page;
@@ -89,62 +92,3 @@ no_cache_flush:
 	if (nr >= TLB_BATCH_NR)
 		flush_tlb_pending();
 }
-
-void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end)
-{
-	struct mmu_gather *mp = &__get_cpu_var(mmu_gathers);
-	unsigned long nr = mp->tlb_nr;
-	long s = start, e = end, vpte_base;
-
-	if (mp->fullmm)
-		return;
-
-	/* If start is greater than end, that is a real problem.  */
-	BUG_ON(start > end);
-
-	/* However, straddling the VA space hole is quite normal. */
-	s &= PMD_MASK;
-	e = (e + PMD_SIZE - 1) & PMD_MASK;
-
-	vpte_base = (tlb_type == spitfire ?
-		     VPTE_BASE_SPITFIRE :
-		     VPTE_BASE_CHEETAH);
-
-	if (unlikely(nr != 0 && mm != mp->mm)) {
-		flush_tlb_pending();
-		nr = 0;
-	}
-
-	if (nr == 0)
-		mp->mm = mm;
-
-	start = vpte_base + (s >> (PAGE_SHIFT - 3));
-	end = vpte_base + (e >> (PAGE_SHIFT - 3));
-
-	/* If the request straddles the VA space hole, we
-	 * need to swap start and end.  The reason this
-	 * occurs is that "vpte_base" is the center of
-	 * the linear page table mapping area.  Thus,
-	 * high addresses with the sign bit set map to
-	 * addresses below vpte_base and non-sign bit
-	 * addresses map to addresses above vpte_base.
-	 */
-	if (end < start) {
-		unsigned long tmp = start;
-
-		start = end;
-		end = tmp;
-	}
-
-	while (start < end) {
-		mp->vaddrs[nr] = start;
-		mp->tlb_nr = ++nr;
-		if (nr >= TLB_BATCH_NR) {
-			flush_tlb_pending();
-			nr = 0;
-		}
-		start += PAGE_SIZE;
-	}
-	if (nr)
-		flush_tlb_pending();
-}
diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c
new file mode 100644
index 000000000000..beaa02810f0e
--- /dev/null
+++ b/arch/sparc64/mm/tsb.c
@@ -0,0 +1,500 @@
+/* arch/sparc64/mm/tsb.c
+ *
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <asm/system.h>
+#include <asm/page.h>
+#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tsb.h>
+#include <asm/oplib.h>
+
+extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES];
+
+static inline unsigned long tsb_hash(unsigned long vaddr, unsigned long hash_shift, unsigned long nentries)
+{
+	vaddr >>= hash_shift;
+	return vaddr & (nentries - 1);
+}
+
+static inline int tag_compare(unsigned long tag, unsigned long vaddr)
+{
+	return (tag == (vaddr >> 22));
+}
+
+/* TSB flushes need only occur on the processor initiating the address
+ * space modification, not on each cpu the address space has run on.
+ * Only the TLB flush needs that treatment.
+ */
+
+void flush_tsb_kernel_range(unsigned long start, unsigned long end)
+{
+	unsigned long v;
+
+	for (v = start; v < end; v += PAGE_SIZE) {
+		unsigned long hash = tsb_hash(v, PAGE_SHIFT,
+					      KERNEL_TSB_NENTRIES);
+		struct tsb *ent = &swapper_tsb[hash];
+
+		if (tag_compare(ent->tag, v)) {
+			ent->tag = (1UL << TSB_TAG_INVALID_BIT);
+			membar_storeload_storestore();
+		}
+	}
+}
+
+static void __flush_tsb_one(struct mmu_gather *mp, unsigned long hash_shift, unsigned long tsb, unsigned long nentries)
+{
+	unsigned long i;
+
+	for (i = 0; i < mp->tlb_nr; i++) {
+		unsigned long v = mp->vaddrs[i];
+		unsigned long tag, ent, hash;
+
+		v &= ~0x1UL;
+
+		hash = tsb_hash(v, hash_shift, nentries);
+		ent = tsb + (hash * sizeof(struct tsb));
+		tag = (v >> 22UL);
+
+		tsb_flush(ent, tag);
+	}
+}
+
+void flush_tsb_user(struct mmu_gather *mp)
+{
+	struct mm_struct *mm = mp->mm;
+	unsigned long nentries, base, flags;
+
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+	base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb;
+	nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries;
+	if (tlb_type == cheetah_plus || tlb_type == hypervisor)
+		base = __pa(base);
+	__flush_tsb_one(mp, PAGE_SHIFT, base, nentries);
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (mm->context.tsb_block[MM_TSB_HUGE].tsb) {
+		base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb;
+		nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries;
+		if (tlb_type == cheetah_plus || tlb_type == hypervisor)
+			base = __pa(base);
+		__flush_tsb_one(mp, HPAGE_SHIFT, base, nentries);
+	}
+#endif
+	spin_unlock_irqrestore(&mm->context.lock, flags);
+}
+
+#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB)
+#define HV_PGSZ_IDX_BASE	HV_PGSZ_IDX_8K
+#define HV_PGSZ_MASK_BASE	HV_PGSZ_MASK_8K
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB)
+#define HV_PGSZ_IDX_BASE	HV_PGSZ_IDX_64K
+#define HV_PGSZ_MASK_BASE	HV_PGSZ_MASK_64K
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_512KB)
+#define HV_PGSZ_IDX_BASE	HV_PGSZ_IDX_512K
+#define HV_PGSZ_MASK_BASE	HV_PGSZ_MASK_512K
+#elif defined(CONFIG_SPARC64_PAGE_SIZE_4MB)
+#define HV_PGSZ_IDX_BASE	HV_PGSZ_IDX_4MB
+#define HV_PGSZ_MASK_BASE	HV_PGSZ_MASK_4MB
+#else
+#error Broken base page size setting...
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K)
+#define HV_PGSZ_IDX_HUGE	HV_PGSZ_IDX_64K
+#define HV_PGSZ_MASK_HUGE	HV_PGSZ_MASK_64K
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K)
+#define HV_PGSZ_IDX_HUGE	HV_PGSZ_IDX_512K
+#define HV_PGSZ_MASK_HUGE	HV_PGSZ_MASK_512K
+#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4MB)
+#define HV_PGSZ_IDX_HUGE	HV_PGSZ_IDX_4MB
+#define HV_PGSZ_MASK_HUGE	HV_PGSZ_MASK_4MB
+#else
+#error Broken huge page size setting...
+#endif
+#endif
+
+static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes)
+{
+	unsigned long tsb_reg, base, tsb_paddr;
+	unsigned long page_sz, tte;
+
+	mm->context.tsb_block[tsb_idx].tsb_nentries =
+		tsb_bytes / sizeof(struct tsb);
+
+	base = TSBMAP_BASE;
+	tte = pgprot_val(PAGE_KERNEL_LOCKED);
+	tsb_paddr = __pa(mm->context.tsb_block[tsb_idx].tsb);
+	BUG_ON(tsb_paddr & (tsb_bytes - 1UL));
+
+	/* Use the smallest page size that can map the whole TSB
+	 * in one TLB entry.
+	 */
+	switch (tsb_bytes) {
+	case 8192 << 0:
+		tsb_reg = 0x0UL;
+#ifdef DCACHE_ALIASING_POSSIBLE
+		base += (tsb_paddr & 8192);
+#endif
+		page_sz = 8192;
+		break;
+
+	case 8192 << 1:
+		tsb_reg = 0x1UL;
+		page_sz = 64 * 1024;
+		break;
+
+	case 8192 << 2:
+		tsb_reg = 0x2UL;
+		page_sz = 64 * 1024;
+		break;
+
+	case 8192 << 3:
+		tsb_reg = 0x3UL;
+		page_sz = 64 * 1024;
+		break;
+
+	case 8192 << 4:
+		tsb_reg = 0x4UL;
+		page_sz = 512 * 1024;
+		break;
+
+	case 8192 << 5:
+		tsb_reg = 0x5UL;
+		page_sz = 512 * 1024;
+		break;
+
+	case 8192 << 6:
+		tsb_reg = 0x6UL;
+		page_sz = 512 * 1024;
+		break;
+
+	case 8192 << 7:
+		tsb_reg = 0x7UL;
+		page_sz = 4 * 1024 * 1024;
+		break;
+
+	default:
+		BUG();
+	};
+	tte |= pte_sz_bits(page_sz);
+
+	if (tlb_type == cheetah_plus || tlb_type == hypervisor) {
+		/* Physical mapping, no locked TLB entry for TSB.  */
+		tsb_reg |= tsb_paddr;
+
+		mm->context.tsb_block[tsb_idx].tsb_reg_val = tsb_reg;
+		mm->context.tsb_block[tsb_idx].tsb_map_vaddr = 0;
+		mm->context.tsb_block[tsb_idx].tsb_map_pte = 0;
+	} else {
+		tsb_reg |= base;
+		tsb_reg |= (tsb_paddr & (page_sz - 1UL));
+		tte |= (tsb_paddr & ~(page_sz - 1UL));
+
+		mm->context.tsb_block[tsb_idx].tsb_reg_val = tsb_reg;
+		mm->context.tsb_block[tsb_idx].tsb_map_vaddr = base;
+		mm->context.tsb_block[tsb_idx].tsb_map_pte = tte;
+	}
+
+	/* Setup the Hypervisor TSB descriptor.  */
+	if (tlb_type == hypervisor) {
+		struct hv_tsb_descr *hp = &mm->context.tsb_descr[tsb_idx];
+
+		switch (tsb_idx) {
+		case MM_TSB_BASE:
+			hp->pgsz_idx = HV_PGSZ_IDX_BASE;
+			break;
+#ifdef CONFIG_HUGETLB_PAGE
+		case MM_TSB_HUGE:
+			hp->pgsz_idx = HV_PGSZ_IDX_HUGE;
+			break;
+#endif
+		default:
+			BUG();
+		};
+		hp->assoc = 1;
+		hp->num_ttes = tsb_bytes / 16;
+		hp->ctx_idx = 0;
+		switch (tsb_idx) {
+		case MM_TSB_BASE:
+			hp->pgsz_mask = HV_PGSZ_MASK_BASE;
+			break;
+#ifdef CONFIG_HUGETLB_PAGE
+		case MM_TSB_HUGE:
+			hp->pgsz_mask = HV_PGSZ_MASK_HUGE;
+			break;
+#endif
+		default:
+			BUG();
+		};
+		hp->tsb_base = tsb_paddr;
+		hp->resv = 0;
+	}
+}
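The size field chosen by the switch in setup_tsb_params() is just the shift of tsb_bytes relative to 8192, and the num_ttes computation above implies a 16-byte TSB entry (one tag word plus one TTE word). A quick worked example for a 64KB TSB, under those assumptions:

#include <stdio.h>

int main(void)
{
	unsigned long tsb_bytes  = 8192UL << 3;		/* 64KB TSB */
	unsigned long nentries   = tsb_bytes / 16;	/* 16-byte entries -> 4096 */
	unsigned long size_field = 0;

	while ((8192UL << size_field) < tsb_bytes)
		size_field++;				/* -> 0x3 */

	printf("entries=%lu size_field=%#lx\n", nentries, size_field);
	return 0;
}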
+
+static kmem_cache_t *tsb_caches[8] __read_mostly;
+
+static const char *tsb_cache_names[8] = {
+	"tsb_8KB",
+	"tsb_16KB",
+	"tsb_32KB",
+	"tsb_64KB",
+	"tsb_128KB",
+	"tsb_256KB",
+	"tsb_512KB",
+	"tsb_1MB",
+};
+
+void __init tsb_cache_init(void)
+{
+	unsigned long i;
+
+	for (i = 0; i < 8; i++) {
+		unsigned long size = 8192 << i;
+		const char *name = tsb_cache_names[i];
+
+		tsb_caches[i] = kmem_cache_create(name,
+						  size, size,
+						  SLAB_HWCACHE_ALIGN |
+						  SLAB_MUST_HWCACHE_ALIGN,
+						  NULL, NULL);
+		if (!tsb_caches[i]) {
+			prom_printf("Could not create %s cache\n", name);
+			prom_halt();
+		}
+	}
+}
+
+/* When the RSS of an address space exceeds tsb_rss_limit for a TSB,
+ * do_sparc64_fault() invokes this routine to try and grow it.
+ *
+ * When we reach the maximum TSB size supported, we stick ~0UL into
+ * tsb_rss_limit for that TSB so the grow checks in do_sparc64_fault()
+ * will not trigger any longer.
+ *
+ * The TSB can be anywhere from 8K to 1MB in size, in increasing powers
+ * of two.  The TSB must be aligned to its size, so e.g. a 512K TSB
+ * must be 512K aligned.  It also must be physically contiguous, so we
+ * cannot use vmalloc().
+ *
+ * The idea here is to grow the TSB when the RSS of the process approaches
+ * the number of entries that the current TSB can hold at once.  Currently,
+ * we trigger when the RSS hits 3/4 of the TSB capacity.
+ */
+void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss)
+{
+	unsigned long max_tsb_size = 1 * 1024 * 1024;
+	unsigned long new_size, old_size, flags;
+	struct tsb *old_tsb, *new_tsb;
+	unsigned long new_cache_index, old_cache_index;
+	unsigned long new_rss_limit;
+	gfp_t gfp_flags;
+
+	if (max_tsb_size > (PAGE_SIZE << MAX_ORDER))
+		max_tsb_size = (PAGE_SIZE << MAX_ORDER);
+
+	new_cache_index = 0;
+	for (new_size = 8192; new_size < max_tsb_size; new_size <<= 1UL) {
+		unsigned long n_entries = new_size / sizeof(struct tsb);
+
+		n_entries = (n_entries * 3) / 4;
+		if (n_entries > rss)
+			break;
+
+		new_cache_index++;
+	}
+
+	if (new_size == max_tsb_size)
+		new_rss_limit = ~0UL;
+	else
+		new_rss_limit = ((new_size / sizeof(struct tsb)) * 3) / 4;
+
+retry_tsb_alloc:
+	gfp_flags = GFP_KERNEL;
+	if (new_size > (PAGE_SIZE * 2))
+		gfp_flags = __GFP_NOWARN | __GFP_NORETRY;
+
+	new_tsb = kmem_cache_alloc(tsb_caches[new_cache_index], gfp_flags);
+	if (unlikely(!new_tsb)) {
+		/* Not being able to fork due to a high-order TSB
+		 * allocation failure is very bad behavior.  Just back
+		 * down to a 0-order allocation and force no TSB
+		 * growing for this address space.
+		 */
+		if (mm->context.tsb_block[tsb_index].tsb == NULL &&
+		    new_cache_index > 0) {
+			new_cache_index = 0;
+			new_size = 8192;
+			new_rss_limit = ~0UL;
+			goto retry_tsb_alloc;
+		}
+
+		/* If we failed on a TSB grow, we are under serious
+		 * memory pressure so don't try to grow any more.
+		 */
+		if (mm->context.tsb_block[tsb_index].tsb != NULL)
+			mm->context.tsb_block[tsb_index].tsb_rss_limit = ~0UL;
+		return;
+	}
+
+	/* Mark all tags as invalid.  */
+	tsb_init(new_tsb, new_size);
+
+	/* Ok, we are about to commit the changes.  If we are
+	 * growing an existing TSB the locking is very tricky,
+	 * so WATCH OUT!
+	 *
+	 * We have to hold mm->context.lock while committing to the
+	 * new TSB, this synchronizes us with processors in
+	 * flush_tsb_user() and switch_mm() for this address space.
+	 *
+	 * But even with that lock held, processors run asynchronously
+	 * accessing the old TSB via TLB miss handling.  This is OK
+	 * because those actions are just propagating state from the
+	 * Linux page tables into the TSB, page table mappings are not
+	 * being changed.  If a real fault occurs, the processor will
+	 * synchronize with us when it hits flush_tsb_user(), this is
+	 * also true for the case where vmscan is modifying the page
+	 * tables.  The only thing we need to be careful with is to
+	 * skip any locked TSB entries during copy_tsb().
+	 *
+	 * When we finish committing to the new TSB, we have to drop
+	 * the lock and ask all other cpus running this address space
+	 * to run tsb_context_switch() to see the new TSB table.
+	 */
+	spin_lock_irqsave(&mm->context.lock, flags);
+
+	old_tsb = mm->context.tsb_block[tsb_index].tsb;
+	old_cache_index =
+		(mm->context.tsb_block[tsb_index].tsb_reg_val & 0x7UL);
+	old_size = (mm->context.tsb_block[tsb_index].tsb_nentries *
+		    sizeof(struct tsb));
+
+
+	/* Handle multiple threads trying to grow the TSB at the same time.
+	 * One will get in here first, and bump the size and the RSS limit.
+	 * The others will get in here next and hit this check.
+	 */
+	if (unlikely(old_tsb &&
+		     (rss < mm->context.tsb_block[tsb_index].tsb_rss_limit))) {
+		spin_unlock_irqrestore(&mm->context.lock, flags);
+
+		kmem_cache_free(tsb_caches[new_cache_index], new_tsb);
+		return;
+	}
+
+	mm->context.tsb_block[tsb_index].tsb_rss_limit = new_rss_limit;
+
+	if (old_tsb) {
+		extern void copy_tsb(unsigned long old_tsb_base,
+				     unsigned long old_tsb_size,
+				     unsigned long new_tsb_base,
+				     unsigned long new_tsb_size);
+		unsigned long old_tsb_base = (unsigned long) old_tsb;
+		unsigned long new_tsb_base = (unsigned long) new_tsb;
+
+		if (tlb_type == cheetah_plus || tlb_type == hypervisor) {
+			old_tsb_base = __pa(old_tsb_base);
+			new_tsb_base = __pa(new_tsb_base);
+		}
+		copy_tsb(old_tsb_base, old_size, new_tsb_base, new_size);
+	}
+
+	mm->context.tsb_block[tsb_index].tsb = new_tsb;
+	setup_tsb_params(mm, tsb_index, new_size);
+
+	spin_unlock_irqrestore(&mm->context.lock, flags);
+
+	/* If old_tsb is NULL, we're being invoked for the first time
+	 * from init_new_context().
+	 */
+	if (old_tsb) {
+		/* Reload it on the local cpu.  */
+		tsb_context_switch(mm);
+
+		/* Now force other processors to do the same.  */
+		smp_tsb_sync(mm);
+
+		/* Now it is safe to free the old tsb.  */
+		kmem_cache_free(tsb_caches[old_cache_index], old_tsb);
+	}
+}
+
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+#ifdef CONFIG_HUGETLB_PAGE
+	unsigned long huge_pte_count;
+#endif
+	unsigned int i;
+
+	spin_lock_init(&mm->context.lock);
+
+	mm->context.sparc64_ctx_val = 0UL;
+
+#ifdef CONFIG_HUGETLB_PAGE
+	/* We reset it to zero because the fork() page copying
+	 * will re-increment the counters as the parent PTEs are
+	 * copied into the child address space.
+	 */
+	huge_pte_count = mm->context.huge_pte_count;
+	mm->context.huge_pte_count = 0;
+#endif
+
+	/* copy_mm() copies over the parent's mm_struct before calling
+	 * us, so we need to zero out the TSB pointer or else tsb_grow()
+	 * will be confused and think there is an older TSB to free up.
+	 */
+	for (i = 0; i < MM_NUM_TSBS; i++)
+		mm->context.tsb_block[i].tsb = NULL;
+
+	/* If this is a fork, inherit the parent's TSB size.  We would
+	 * grow it to that size on the first page fault anyway.
+	 */
+	tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm));
+
+#ifdef CONFIG_HUGETLB_PAGE
+	if (unlikely(huge_pte_count))
+		tsb_grow(mm, MM_TSB_HUGE, huge_pte_count);
+#endif
+
+	if (unlikely(!mm->context.tsb_block[MM_TSB_BASE].tsb))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void tsb_destroy_one(struct tsb_config *tp)
+{
+	unsigned long cache_index;
+
+	if (!tp->tsb)
+		return;
+	cache_index = tp->tsb_reg_val & 0x7UL;
+	kmem_cache_free(tsb_caches[cache_index], tp->tsb);
+	tp->tsb = NULL;
+	tp->tsb_reg_val = 0UL;
+}
+
+void destroy_context(struct mm_struct *mm)
+{
+	unsigned long flags, i;
+
+	for (i = 0; i < MM_NUM_TSBS; i++)
+		tsb_destroy_one(&mm->context.tsb_block[i]);
+
+	spin_lock_irqsave(&ctx_alloc_lock, flags);
+
+	if (CTX_VALID(mm->context)) {
+		unsigned long nr = CTX_NRBITS(mm->context);
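+		/* Each mmu_context_bmap word tracks 64 context numbers:
+		 * nr >> 6 selects the word, nr & 63 the bit within it.
+		 */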
+		mmu_context_bmap[nr>>6] &= ~(1UL << (nr & 63));
+	}
+
+	spin_unlock_irqrestore(&ctx_alloc_lock, flags);
+}
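As a rough illustration of the sizing policy tsb_grow() implements, here is a
minimal user-space sketch.  It assumes the two-word, 16-byte TSB entry; the
file and helper names are illustrative and nothing below is part of the patch
itself:

/* tsb_size_demo.c - standalone illustration of the tsb_grow() sizing rule:
 * pick the smallest power-of-two TSB whose entry count, derated to 3/4,
 * still exceeds the resident set size.
 */
#include <stdio.h>

#define TSB_ENTRY_BYTES	16UL			/* two-word tag/pte entry */
#define TSB_MAX_BYTES	(1UL * 1024 * 1024)	/* same 1MB ceiling */

static unsigned long pick_tsb_bytes(unsigned long rss)
{
	unsigned long size;

	for (size = 8192; size < TSB_MAX_BYTES; size <<= 1) {
		unsigned long threshold = (size / TSB_ENTRY_BYTES) * 3 / 4;

		if (threshold > rss)
			break;
	}
	return size;
}

int main(void)
{
	unsigned long rss;

	for (rss = 100; rss <= 100000; rss *= 10)
		printf("rss %6lu pages -> %7lu byte TSB\n",
		       rss, pick_tsb_bytes(rss));
	return 0;
}

With these numbers, an rss of 1,000 pages selects a 32KB TSB, and anything
past 24,576 pages pins at the 1MB ceiling, where tsb_grow() then lifts the
rss limit entirely.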
diff --git a/arch/sparc64/mm/ultra.S b/arch/sparc64/mm/ultra.S
index e4c9151fa116..f8479fad4047 100644
--- a/arch/sparc64/mm/ultra.S
+++ b/arch/sparc64/mm/ultra.S
@@ -15,6 +15,7 @@
 #include <asm/head.h>
 #include <asm/thread_info.h>
 #include <asm/cacheflush.h>
+#include <asm/hypervisor.h>
 
 	/* Basically, most of the Spitfire vs. Cheetah madness
 	 * has to do with the fact that Cheetah does not support
@@ -29,16 +30,18 @@
 	.text
 	.align		32
 	.globl		__flush_tlb_mm
-__flush_tlb_mm: /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */
+__flush_tlb_mm:		/* 18 insns */
+	/* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */
 	ldxa		[%o1] ASI_DMMU, %g2
 	cmp		%g2, %o0
 	bne,pn		%icc, __spitfire_flush_tlb_mm_slow
 	 mov		0x50, %g3
 	stxa		%g0, [%g3] ASI_DMMU_DEMAP
 	stxa		%g0, [%g3] ASI_IMMU_DEMAP
+	sethi		%hi(KERNBASE), %g3
+	flush		%g3
 	retl
-	 flush		%g6
-	nop
+	 nop
 	nop
 	nop
 	nop
@@ -51,7 +54,7 @@ __flush_tlb_mm: /* %o0=(ctx & TAG_CONTEXT_BITS), %o1=SECONDARY_CONTEXT */
 
 	.align		32
 	.globl		__flush_tlb_pending
-__flush_tlb_pending:
+__flush_tlb_pending:	/* 26 insns */
 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
 	rdpr		%pstate, %g7
 	sllx		%o1, 3, %o1
@@ -72,7 +75,8 @@ __flush_tlb_pending:
 	brnz,pt		%o1, 1b
 	 nop
 	stxa		%g2, [%o4] ASI_DMMU
-	flush		%g6
+	sethi		%hi(KERNBASE), %o4
+	flush		%o4
 	retl
 	 wrpr		%g7, 0x0, %pstate
 	nop
@@ -82,7 +86,8 @@ __flush_tlb_pending:
 
 	.align		32
 	.globl		__flush_tlb_kernel_range
-__flush_tlb_kernel_range:	/* %o0=start, %o1=end */
+__flush_tlb_kernel_range:	/* 16 insns */
+	/* %o0=start, %o1=end */
 	cmp		%o0, %o1
 	be,pn		%xcc, 2f
 	 sethi		%hi(PAGE_SIZE), %o4
@@ -94,8 +99,11 @@ __flush_tlb_kernel_range:	/* %o0=start, %o1=end */
 	membar		#Sync
 	brnz,pt		%o3, 1b
 	 sub		%o3, %o4, %o3
-2:	retl
-	 flush		%g6
+2:	sethi		%hi(KERNBASE), %o3
+	flush		%o3
+	retl
+	 nop
+	nop
 
 __spitfire_flush_tlb_mm_slow:
 	rdpr		%pstate, %g1
@@ -105,7 +113,8 @@ __spitfire_flush_tlb_mm_slow:
 	stxa		%g0, [%g3] ASI_IMMU_DEMAP
 	flush		%g6
 	stxa		%g2, [%o1] ASI_DMMU
-	flush		%g6
+	sethi		%hi(KERNBASE), %o1
+	flush		%o1
 	retl
 	 wrpr		%g1, 0, %pstate
 
@@ -181,7 +190,7 @@ __flush_dcache_page:	/* %o0=kaddr, %o1=flush_icache */
 	.previous
 
 	/* Cheetah specific versions, patched at boot time. */
-__cheetah_flush_tlb_mm: /* 18 insns */
+__cheetah_flush_tlb_mm: /* 19 insns */
 	rdpr		%pstate, %g7
 	andn		%g7, PSTATE_IE, %g2
 	wrpr		%g2, 0x0, %pstate
@@ -196,12 +205,13 @@ __cheetah_flush_tlb_mm: /* 18 insns */
 	stxa		%g0, [%g3] ASI_DMMU_DEMAP
 	stxa		%g0, [%g3] ASI_IMMU_DEMAP
 	stxa		%g2, [%o2] ASI_DMMU
-	flush		%g6
+	sethi		%hi(KERNBASE), %o2
+	flush		%o2
 	wrpr		%g0, 0, %tl
 	retl
 	 wrpr		%g7, 0x0, %pstate
 
-__cheetah_flush_tlb_pending:	/* 26 insns */
+__cheetah_flush_tlb_pending:	/* 27 insns */
 	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
 	rdpr		%pstate, %g7
 	sllx		%o1, 3, %o1
@@ -225,7 +235,8 @@ __cheetah_flush_tlb_pending:	/* 26 insns */
 	brnz,pt		%o1, 1b
 	 nop
 	stxa		%g2, [%o4] ASI_DMMU
-	flush		%g6
+	sethi		%hi(KERNBASE), %o4
+	flush		%o4
 	wrpr		%g0, 0, %tl
 	retl
 	 wrpr		%g7, 0x0, %pstate
@@ -245,7 +256,76 @@ __cheetah_flush_dcache_page: /* 11 insns */
 	 nop
 #endif /* DCACHE_ALIASING_POSSIBLE */
 
-cheetah_patch_one:
+	/* Hypervisor specific versions, patched at boot time.  */
+__hypervisor_tlb_tl0_error:
+	save		%sp, -192, %sp
+	mov		%i0, %o0
+	call		hypervisor_tlbop_error
+	 mov		%i1, %o1
+	ret
+	 restore
+
+__hypervisor_flush_tlb_mm: /* 10 insns */
+	mov		%o0, %o2	/* ARG2: mmu context */
+	mov		0, %o0		/* ARG0: CPU lists unimplemented */
+	mov		0, %o1		/* ARG1: CPU lists unimplemented */
+	mov		HV_MMU_ALL, %o3	/* ARG3: flags */
+	mov		HV_FAST_MMU_DEMAP_CTX, %o5
+	ta		HV_FAST_TRAP
+	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	 mov		HV_FAST_MMU_DEMAP_CTX, %o1
+	retl
+	 nop
+
+__hypervisor_flush_tlb_pending: /* 16 insns */
+	/* %o0 = context, %o1 = nr, %o2 = vaddrs[] */
+	sllx		%o1, 3, %g1
+	mov		%o2, %g2
+	mov		%o0, %g3
+1:	sub		%g1, (1 << 3), %g1
+	ldx		[%g2 + %g1], %o0      /* ARG0: vaddr + IMMU-bit */
+	mov		%g3, %o1	      /* ARG1: mmu context */
+	mov		HV_MMU_ALL, %o2	      /* ARG2: flags */
+	srlx		%o0, PAGE_SHIFT, %o0
+	sllx		%o0, PAGE_SHIFT, %o0
+	ta		HV_MMU_UNMAP_ADDR_TRAP
+	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	 mov		HV_MMU_UNMAP_ADDR_TRAP, %o1
+	brnz,pt		%g1, 1b
+	 nop
+	retl
+	 nop
+
+__hypervisor_flush_tlb_kernel_range: /* 16 insns */
+	/* %o0=start, %o1=end */
+	cmp		%o0, %o1
+	be,pn		%xcc, 2f
+	 sethi		%hi(PAGE_SIZE), %g3
+	mov		%o0, %g1
+	sub		%o1, %g1, %g2
+	sub		%g2, %g3, %g2
+1:	add		%g1, %g2, %o0	/* ARG0: virtual address */
+	mov		0, %o1		/* ARG1: mmu context */
+	mov		HV_MMU_ALL, %o2	/* ARG2: flags */
+	ta		HV_MMU_UNMAP_ADDR_TRAP
+	brnz,pn		%o0, __hypervisor_tlb_tl0_error
+	 mov		HV_MMU_UNMAP_ADDR_TRAP, %o1
+	brnz,pt		%g2, 1b
+	 sub		%g2, %g3, %g2
+2:	retl
+	 nop
+
+#ifdef DCACHE_ALIASING_POSSIBLE
+	/* XXX Niagara and friends have an 8K cache, so no aliasing is
+	 * XXX possible, but nothing explicit in the Hypervisor API
+	 * XXX guarantees this.
+	 */
+__hypervisor_flush_dcache_page:	/* 2 insns */
+	retl
+	 nop
+#endif
+
+tlb_patch_one:
 1:	lduw		[%o1], %g1
 	stw		%g1, [%o0]
 	flush		%o0
@@ -264,22 +344,22 @@ cheetah_patch_cachetlbops:
 	or		%o0, %lo(__flush_tlb_mm), %o0
 	sethi		%hi(__cheetah_flush_tlb_mm), %o1
 	or		%o1, %lo(__cheetah_flush_tlb_mm), %o1
-	call		cheetah_patch_one
-	 mov		18, %o2
+	call		tlb_patch_one
+	 mov		19, %o2
 
 	sethi		%hi(__flush_tlb_pending), %o0
 	or		%o0, %lo(__flush_tlb_pending), %o0
 	sethi		%hi(__cheetah_flush_tlb_pending), %o1
 	or		%o1, %lo(__cheetah_flush_tlb_pending), %o1
-	call		cheetah_patch_one
-	 mov		26, %o2
+	call		tlb_patch_one
+	 mov		27, %o2
 
 #ifdef DCACHE_ALIASING_POSSIBLE
 	sethi		%hi(__flush_dcache_page), %o0
 	or		%o0, %lo(__flush_dcache_page), %o0
 	sethi		%hi(__cheetah_flush_dcache_page), %o1
 	or		%o1, %lo(__cheetah_flush_dcache_page), %o1
-	call		cheetah_patch_one
+	call		tlb_patch_one
 	 mov		11, %o2
 #endif /* DCACHE_ALIASING_POSSIBLE */
 
@@ -295,16 +375,14 @@ cheetah_patch_cachetlbops:
 	 *   %g1	address arg 1	(tlb page and range flushes)
 	 *   %g7	address arg 2	(tlb range flush only)
 	 *
-	 *   %g6	ivector table, don't touch
-	 *   %g2	scratch 1
-	 *   %g3	scratch 2
-	 *   %g4	scratch 3
-	 *
-	 * TODO: Make xcall TLB range flushes use the tricks above... -DaveM
+	 *   %g6	scratch 1
+	 *   %g2	scratch 2
+	 *   %g3	scratch 3
+	 *   %g4	scratch 4
 	 */
 	.align		32
 	.globl		xcall_flush_tlb_mm
-xcall_flush_tlb_mm:
+xcall_flush_tlb_mm:	/* 21 insns */
 	mov		PRIMARY_CONTEXT, %g2
 	ldxa		[%g2] ASI_DMMU, %g3
 	srlx		%g3, CTX_PGSZ1_NUC_SHIFT, %g4
@@ -316,9 +394,19 @@ xcall_flush_tlb_mm:
 	stxa		%g0, [%g4] ASI_IMMU_DEMAP
 	stxa		%g3, [%g2] ASI_DMMU
 	retry
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
 
 	.globl		xcall_flush_tlb_pending
-xcall_flush_tlb_pending:
+xcall_flush_tlb_pending:	/* 21 insns */
 	/* %g5=context, %g1=nr, %g7=vaddrs[] */
 	sllx		%g1, 3, %g1
 	mov		PRIMARY_CONTEXT, %g4
@@ -341,9 +429,10 @@ xcall_flush_tlb_pending:
 	 nop
 	stxa		%g2, [%g4] ASI_DMMU
 	retry
+	nop
 
 	.globl		xcall_flush_tlb_kernel_range
-xcall_flush_tlb_kernel_range:
+xcall_flush_tlb_kernel_range:	/* 25 insns */
 	sethi		%hi(PAGE_SIZE - 1), %g2
 	or		%g2, %lo(PAGE_SIZE - 1), %g2
 	andn		%g1, %g2, %g1
@@ -360,14 +449,30 @@ xcall_flush_tlb_kernel_range:
 	retry
 	nop
 	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
+	nop
 
 	/* This runs in a very controlled environment, so we do
 	 * not need to worry about BH races etc.
 	 */
 	.globl		xcall_sync_tick
 xcall_sync_tick:
-	rdpr		%pstate, %g2
+
+661:	rdpr		%pstate, %g2
 	wrpr		%g2, PSTATE_IG | PSTATE_AG, %pstate
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
 	rdpr		%pil, %g2
 	wrpr		%g0, 15, %pil
 	sethi		%hi(109f), %g7
@@ -390,8 +495,15 @@ xcall_sync_tick:
 	 */
 	.globl		xcall_report_regs
 xcall_report_regs:
-	rdpr		%pstate, %g2
+
+661:	rdpr		%pstate, %g2
 	wrpr		%g2, PSTATE_IG | PSTATE_AG, %pstate
+	.section	.sun4v_2insn_patch, "ax"
+	.word		661b
+	nop
+	nop
+	.previous
+
 	rdpr		%pil, %g2
 	wrpr		%g0, 15, %pil
 	sethi		%hi(109f), %g7
@@ -453,62 +565,96 @@ xcall_flush_dcache_page_spitfire: /* %g1 == physical page address
 	nop
 	nop
 
-	.data
-
-errata32_hwbug:
-	.xword	0
-
-	.text
-
-	/* These two are not performance critical... */
-	.globl		xcall_flush_tlb_all_spitfire
-xcall_flush_tlb_all_spitfire:
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-	clr		%g2
-	clr		%g3
-1:	ldxa		[%g3] ASI_DTLB_DATA_ACCESS, %g4
-	and		%g4, _PAGE_L, %g5
-	brnz,pn		%g5, 2f
-	 mov		TLB_TAG_ACCESS, %g7
-
-	stxa		%g0, [%g7] ASI_DMMU
-	membar		#Sync
-	stxa		%g0, [%g3] ASI_DTLB_DATA_ACCESS
+	/* %g5:	error
+	 * %g6:	tlb op
+	 */
+__hypervisor_tlb_xcall_error:
+	mov	%g5, %g4
+	mov	%g6, %g5
+	ba,pt	%xcc, etrap
+	 rd	%pc, %g7
+	mov	%l4, %o0
+	call	hypervisor_tlbop_error_xcall
+	 mov	%l5, %o1
+	ba,a,pt	%xcc, rtrap_clr_l6
+
+	.globl		__hypervisor_xcall_flush_tlb_mm
+__hypervisor_xcall_flush_tlb_mm: /* 21 insns */
+	/* %g5=ctx, g1,g2,g3,g4,g7=scratch, %g6=unusable */
+	mov		%o0, %g2
+	mov		%o1, %g3
+	mov		%o2, %g4
+	mov		%o3, %g1
+	mov		%o5, %g7
+	clr		%o0		/* ARG0: CPU lists unimplemented */
+	clr		%o1		/* ARG1: CPU lists unimplemented */
+	mov		%g5, %o2	/* ARG2: mmu context */
+	mov		HV_MMU_ALL, %o3	/* ARG3: flags */
+	mov		HV_FAST_MMU_DEMAP_CTX, %o5
+	ta		HV_FAST_TRAP
+	mov		HV_FAST_MMU_DEMAP_CTX, %g6
+	brnz,pn		%o0, __hypervisor_tlb_xcall_error
+	 mov		%o0, %g5
+	mov		%g2, %o0
+	mov		%g3, %o1
+	mov		%g4, %o2
+	mov		%g1, %o3
+	mov		%g7, %o5
 	membar		#Sync
+	retry
 
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-2:	ldxa		[%g3] ASI_ITLB_DATA_ACCESS, %g4
-	and		%g4, _PAGE_L, %g5
-	brnz,pn		%g5, 2f
-	 mov		TLB_TAG_ACCESS, %g7
-
-	stxa		%g0, [%g7] ASI_IMMU
-	membar		#Sync
-	stxa		%g0, [%g3] ASI_ITLB_DATA_ACCESS
+	.globl		__hypervisor_xcall_flush_tlb_pending
+__hypervisor_xcall_flush_tlb_pending: /* 21 insns */
+	/* %g5=ctx, %g1=nr, %g7=vaddrs[], %g2,%g3,%g4,g6=scratch */
+	sllx		%g1, 3, %g1
+	mov		%o0, %g2
+	mov		%o1, %g3
+	mov		%o2, %g4
+1:	sub		%g1, (1 << 3), %g1
+	ldx		[%g7 + %g1], %o0	/* ARG0: virtual address */
+	mov		%g5, %o1		/* ARG1: mmu context */
+	mov		HV_MMU_ALL, %o2		/* ARG2: flags */
+	srlx		%o0, PAGE_SHIFT, %o0
+	sllx		%o0, PAGE_SHIFT, %o0
+	ta		HV_MMU_UNMAP_ADDR_TRAP
+	mov		HV_MMU_UNMAP_ADDR_TRAP, %g6
+	brnz,a,pn	%o0, __hypervisor_tlb_xcall_error
+	 mov		%o0, %g5
+	brnz,pt		%g1, 1b
+	 nop
+	mov		%g2, %o0
+	mov		%g3, %o1
+	mov		%g4, %o2
 	membar		#Sync
-
-	/* Spitfire Errata #32 workaround. */
-	sethi		%hi(errata32_hwbug), %g4
-	stx		%g0, [%g4 + %lo(errata32_hwbug)]
-
-2:	add		%g2, 1, %g2
-	cmp		%g2, SPITFIRE_HIGHEST_LOCKED_TLBENT
-	ble,pt		%icc, 1b
-	 sll		%g2, 3, %g3
-	flush		%g6
 	retry
 
-	.globl		xcall_flush_tlb_all_cheetah
-xcall_flush_tlb_all_cheetah:
-	mov		0x80, %g2
-	stxa		%g0, [%g2] ASI_DMMU_DEMAP
-	stxa		%g0, [%g2] ASI_IMMU_DEMAP
+	.globl		__hypervisor_xcall_flush_tlb_kernel_range
+__hypervisor_xcall_flush_tlb_kernel_range: /* 25 insns */
+	/* %g1=start, %g7=end, g2,g3,g4,g5,g6=scratch */
+	sethi		%hi(PAGE_SIZE - 1), %g2
+	or		%g2, %lo(PAGE_SIZE - 1), %g2
+	andn		%g1, %g2, %g1
+	andn		%g7, %g2, %g7
+	sub		%g7, %g1, %g3
+	add		%g2, 1, %g2
+	sub		%g3, %g2, %g3
+	mov		%o0, %g2
+	mov		%o1, %g4
+	mov		%o2, %g7
+1:	add		%g1, %g3, %o0	/* ARG0: virtual address */
+	mov		0, %o1		/* ARG1: mmu context */
+	mov		HV_MMU_ALL, %o2	/* ARG2: flags */
+	ta		HV_MMU_UNMAP_ADDR_TRAP
+	mov		HV_MMU_UNMAP_ADDR_TRAP, %g6
+	brnz,pn		%o0, __hypervisor_tlb_xcall_error
+	 mov		%o0, %g5
+	sethi		%hi(PAGE_SIZE), %o2
+	brnz,pt		%g3, 1b
+	 sub		%g3, %o2, %g3
+	mov		%g2, %o0
+	mov		%g4, %o1
+	mov		%g7, %o2
+	membar		#Sync
 	retry
 
 	/* These just get rescheduled to PIL vectors. */
@@ -527,4 +673,70 @@ xcall_capture:
 	wr		%g0, (1 << PIL_SMP_CAPTURE), %set_softint
 	retry
 
+	.globl		xcall_new_mmu_context_version
+xcall_new_mmu_context_version:
+	wr		%g0, (1 << PIL_SMP_CTX_NEW_VERSION), %set_softint
+	retry
+
 #endif /* CONFIG_SMP */
+
+
+	.globl		hypervisor_patch_cachetlbops
+hypervisor_patch_cachetlbops:
+	save		%sp, -128, %sp
+
+	sethi		%hi(__flush_tlb_mm), %o0
+	or		%o0, %lo(__flush_tlb_mm), %o0
+	sethi		%hi(__hypervisor_flush_tlb_mm), %o1
+	or		%o1, %lo(__hypervisor_flush_tlb_mm), %o1
+	call		tlb_patch_one
+	 mov		10, %o2
+
+	sethi		%hi(__flush_tlb_pending), %o0
+	or		%o0, %lo(__flush_tlb_pending), %o0
+	sethi		%hi(__hypervisor_flush_tlb_pending), %o1
+	or		%o1, %lo(__hypervisor_flush_tlb_pending), %o1
+	call		tlb_patch_one
+	 mov		16, %o2
+
+	sethi		%hi(__flush_tlb_kernel_range), %o0
+	or		%o0, %lo(__flush_tlb_kernel_range), %o0
+	sethi		%hi(__hypervisor_flush_tlb_kernel_range), %o1
+	or		%o1, %lo(__hypervisor_flush_tlb_kernel_range), %o1
+	call		tlb_patch_one
+	 mov		16, %o2
+
+#ifdef DCACHE_ALIASING_POSSIBLE
+	sethi		%hi(__flush_dcache_page), %o0
+	or		%o0, %lo(__flush_dcache_page), %o0
+	sethi		%hi(__hypervisor_flush_dcache_page), %o1
+	or		%o1, %lo(__hypervisor_flush_dcache_page), %o1
+	call		tlb_patch_one
+	 mov		2, %o2
+#endif /* DCACHE_ALIASING_POSSIBLE */
+
+#ifdef CONFIG_SMP
+	sethi		%hi(xcall_flush_tlb_mm), %o0
+	or		%o0, %lo(xcall_flush_tlb_mm), %o0
+	sethi		%hi(__hypervisor_xcall_flush_tlb_mm), %o1
+	or		%o1, %lo(__hypervisor_xcall_flush_tlb_mm), %o1
+	call		tlb_patch_one
+	 mov		21, %o2
+
+	sethi		%hi(xcall_flush_tlb_pending), %o0
+	or		%o0, %lo(xcall_flush_tlb_pending), %o0
+	sethi		%hi(__hypervisor_xcall_flush_tlb_pending), %o1
+	or		%o1, %lo(__hypervisor_xcall_flush_tlb_pending), %o1
+	call		tlb_patch_one
+	 mov		21, %o2
+
+	sethi		%hi(xcall_flush_tlb_kernel_range), %o0
+	or		%o0, %lo(xcall_flush_tlb_kernel_range), %o0
+	sethi		%hi(__hypervisor_xcall_flush_tlb_kernel_range), %o1
+	or		%o1, %lo(__hypervisor_xcall_flush_tlb_kernel_range), %o1
+	call		tlb_patch_one
+	 mov		25, %o2
+#endif /* CONFIG_SMP */
+
+	ret
+	 restore
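The cheetah_patch_cachetlbops and hypervisor_patch_cachetlbops routines above
install the CPU-specific variants at boot by having tlb_patch_one copy them,
one 32-bit instruction word at a time, over the generic routines.  Below is a
minimal C rendering of that copier, assuming a sparc64 build; the name and
prototype are illustrative, since the kernel keeps this routine in assembly:

static void patch_insns(unsigned int *dst, const unsigned int *src,
			unsigned int insns)
{
	while (insns--) {
		*dst = *src++;			/* lduw [%o1] / stw [%o0]  */
		__asm__ __volatile__("flush %0"	/* flush the patched word  */
				     : : "r" (dst) : "memory");
		dst++;
	}
}

This is also why every patched routine carries an instruction-count comment
(/* 19 insns */, /* 10 insns */, and so on): that count is the length argument
handed to tlb_patch_one and has to track the routine's actual size.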
diff --git a/arch/sparc64/prom/cif.S b/arch/sparc64/prom/cif.S
index 29d0ae74aed8..5f27ad779c0c 100644
--- a/arch/sparc64/prom/cif.S
+++ b/arch/sparc64/prom/cif.S
@@ -1,10 +1,12 @@
 /* cif.S: PROM entry/exit assembler trampolines.
  *
- * Copyright (C) 1996,1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
- * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
+ * Copyright (C) 1996, 1997 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
+ * Copyright (C) 2005, 2006 David S. Miller <davem@davemloft.net>
  */
 
 #include <asm/pstate.h>
+#include <asm/cpudata.h>
+#include <asm/thread_info.h>
 
 	.text
 	.globl	prom_cif_interface
@@ -12,78 +14,16 @@ prom_cif_interface:
 	sethi	%hi(p1275buf), %o0
 	or	%o0, %lo(p1275buf), %o0
 	ldx	[%o0 + 0x010], %o1	! prom_cif_stack
-	save	%o1, -0x190, %sp
+	save	%o1, -192, %sp
 	ldx	[%i0 + 0x008], %l2	! prom_cif_handler
-	rdpr	%pstate, %l4
-	wrpr	%g0, 0x15, %pstate	! save alternate globals
-	stx	%g1, [%sp + 2047 + 0x0b0]
-	stx	%g2, [%sp + 2047 + 0x0b8]
-	stx	%g3, [%sp + 2047 + 0x0c0]
-	stx	%g4, [%sp + 2047 + 0x0c8]
-	stx	%g5, [%sp + 2047 + 0x0d0]
-	stx	%g6, [%sp + 2047 + 0x0d8]
-	stx	%g7, [%sp + 2047 + 0x0e0]
-	wrpr	%g0, 0x814, %pstate	! save interrupt globals
-	stx	%g1, [%sp + 2047 + 0x0e8]
-	stx	%g2, [%sp + 2047 + 0x0f0]
-	stx	%g3, [%sp + 2047 + 0x0f8]
-	stx	%g4, [%sp + 2047 + 0x100]
-	stx	%g5, [%sp + 2047 + 0x108]
-	stx	%g6, [%sp + 2047 + 0x110]
-	stx	%g7, [%sp + 2047 + 0x118]
-	wrpr	%g0, 0x14, %pstate	! save normal globals
-	stx	%g1, [%sp + 2047 + 0x120]
-	stx	%g2, [%sp + 2047 + 0x128]
-	stx	%g3, [%sp + 2047 + 0x130]
-	stx	%g4, [%sp + 2047 + 0x138]
-	stx	%g5, [%sp + 2047 + 0x140]
-	stx	%g6, [%sp + 2047 + 0x148]
-	stx	%g7, [%sp + 2047 + 0x150]
-	wrpr	%g0, 0x414, %pstate	! save mmu globals
-	stx	%g1, [%sp + 2047 + 0x158]
-	stx	%g2, [%sp + 2047 + 0x160]
-	stx	%g3, [%sp + 2047 + 0x168]
-	stx	%g4, [%sp + 2047 + 0x170]
-	stx	%g5, [%sp + 2047 + 0x178]
-	stx	%g6, [%sp + 2047 + 0x180]
-	stx	%g7, [%sp + 2047 + 0x188]
-	mov	%g1, %l0		! also save to locals, so we can handle
-	mov	%g2, %l1		! tlb faults later on, when accessing
-	mov	%g3, %l3		! the stack.
-	mov	%g7, %l5
-	wrpr	%l4, PSTATE_IE, %pstate	! turn off interrupts
+	mov	%g4, %l0
+	mov	%g5, %l1
+	mov	%g6, %l3
 	call	%l2
 	 add	%i0, 0x018, %o0		! prom_args
-	wrpr	%g0, 0x414, %pstate	! restore mmu globals
-	mov	%l0, %g1
-	mov	%l1, %g2
-	mov	%l3, %g3
-	mov	%l5, %g7
-	wrpr	%g0, 0x14, %pstate	! restore normal globals
-	ldx	[%sp + 2047 + 0x120], %g1
-	ldx	[%sp + 2047 + 0x128], %g2
-	ldx	[%sp + 2047 + 0x130], %g3
-	ldx	[%sp + 2047 + 0x138], %g4
-	ldx	[%sp + 2047 + 0x140], %g5
-	ldx	[%sp + 2047 + 0x148], %g6
-	ldx	[%sp + 2047 + 0x150], %g7
-	wrpr	%g0, 0x814, %pstate	! restore interrupt globals
-	ldx	[%sp + 2047 + 0x0e8], %g1
-	ldx	[%sp + 2047 + 0x0f0], %g2
-	ldx	[%sp + 2047 + 0x0f8], %g3
-	ldx	[%sp + 2047 + 0x100], %g4
-	ldx	[%sp + 2047 + 0x108], %g5
-	ldx	[%sp + 2047 + 0x110], %g6
-	ldx	[%sp + 2047 + 0x118], %g7
-	wrpr	%g0, 0x15, %pstate	! restore alternate globals
-	ldx	[%sp + 2047 + 0x0b0], %g1
-	ldx	[%sp + 2047 + 0x0b8], %g2
-	ldx	[%sp + 2047 + 0x0c0], %g3
-	ldx	[%sp + 2047 + 0x0c8], %g4
-	ldx	[%sp + 2047 + 0x0d0], %g5
-	ldx	[%sp + 2047 + 0x0d8], %g6
-	ldx	[%sp + 2047 + 0x0e0], %g7
-	wrpr	%l4, 0, %pstate	! restore original pstate
+	mov	%l0, %g4
+	mov	%l1, %g5
+	mov	%l3, %g6
 	ret
 	 restore
 
@@ -91,135 +31,18 @@ prom_cif_interface:
 prom_cif_callback:
 	sethi	%hi(p1275buf), %o1
 	or	%o1, %lo(p1275buf), %o1
-	save	%sp, -0x270, %sp
-	rdpr	%pstate, %l4
-	wrpr	%g0, 0x15, %pstate	! save PROM alternate globals
-	stx	%g1, [%sp + 2047 + 0x0b0]
-	stx	%g2, [%sp + 2047 + 0x0b8]
-	stx	%g3, [%sp + 2047 + 0x0c0]
-	stx	%g4, [%sp + 2047 + 0x0c8]
-	stx	%g5, [%sp + 2047 + 0x0d0]
-	stx	%g6, [%sp + 2047 + 0x0d8]
-	stx	%g7, [%sp + 2047 + 0x0e0]
-					! restore Linux alternate globals
-	ldx	[%sp + 2047 + 0x190], %g1
-	ldx	[%sp + 2047 + 0x198], %g2
-	ldx	[%sp + 2047 + 0x1a0], %g3
-	ldx	[%sp + 2047 + 0x1a8], %g4
-	ldx	[%sp + 2047 + 0x1b0], %g5
-	ldx	[%sp + 2047 + 0x1b8], %g6
-	ldx	[%sp + 2047 + 0x1c0], %g7
-	wrpr	%g0, 0x814, %pstate	! save PROM interrupt globals
-	stx	%g1, [%sp + 2047 + 0x0e8]
-	stx	%g2, [%sp + 2047 + 0x0f0]
-	stx	%g3, [%sp + 2047 + 0x0f8]
-	stx	%g4, [%sp + 2047 + 0x100]
-	stx	%g5, [%sp + 2047 + 0x108]
-	stx	%g6, [%sp + 2047 + 0x110]
-	stx	%g7, [%sp + 2047 + 0x118]
-					! restore Linux interrupt globals
-	ldx	[%sp + 2047 + 0x1c8], %g1
-	ldx	[%sp + 2047 + 0x1d0], %g2
-	ldx	[%sp + 2047 + 0x1d8], %g3
-	ldx	[%sp + 2047 + 0x1e0], %g4
-	ldx	[%sp + 2047 + 0x1e8], %g5
-	ldx	[%sp + 2047 + 0x1f0], %g6
-	ldx	[%sp + 2047 + 0x1f8], %g7
-	wrpr	%g0, 0x14, %pstate	! save PROM normal globals
-	stx	%g1, [%sp + 2047 + 0x120]
-	stx	%g2, [%sp + 2047 + 0x128]
-	stx	%g3, [%sp + 2047 + 0x130]
-	stx	%g4, [%sp + 2047 + 0x138]
-	stx	%g5, [%sp + 2047 + 0x140]
-	stx	%g6, [%sp + 2047 + 0x148]
-	stx	%g7, [%sp + 2047 + 0x150]
-					! restore Linux normal globals
-	ldx	[%sp + 2047 + 0x200], %g1
-	ldx	[%sp + 2047 + 0x208], %g2
-	ldx	[%sp + 2047 + 0x210], %g3
-	ldx	[%sp + 2047 + 0x218], %g4
-	ldx	[%sp + 2047 + 0x220], %g5
-	ldx	[%sp + 2047 + 0x228], %g6
-	ldx	[%sp + 2047 + 0x230], %g7
-	wrpr	%g0, 0x414, %pstate	! save PROM mmu globals
-	stx	%g1, [%sp + 2047 + 0x158]
-	stx	%g2, [%sp + 2047 + 0x160]
-	stx	%g3, [%sp + 2047 + 0x168]
-	stx	%g4, [%sp + 2047 + 0x170]
-	stx	%g5, [%sp + 2047 + 0x178]
-	stx	%g6, [%sp + 2047 + 0x180]
-	stx	%g7, [%sp + 2047 + 0x188]
-					! restore Linux mmu globals
-	ldx	[%sp + 2047 + 0x238], %o0
-	ldx	[%sp + 2047 + 0x240], %o1
-	ldx	[%sp + 2047 + 0x248], %l2
-	ldx	[%sp + 2047 + 0x250], %l3
-	ldx	[%sp + 2047 + 0x258], %l5
-	ldx	[%sp + 2047 + 0x260], %l6
-	ldx	[%sp + 2047 + 0x268], %l7
-					! switch to Linux tba
-	sethi	%hi(sparc64_ttable_tl0), %l1
-	rdpr	%tba, %l0		! save PROM tba
-	mov	%o0, %g1
-	mov	%o1, %g2
-	mov	%l2, %g3
-	mov	%l3, %g4
-	mov	%l5, %g5
-	mov	%l6, %g6
-	mov	%l7, %g7
-	wrpr	%l1, %tba		! install Linux tba
-	wrpr	%l4, 0, %pstate		! restore PSTATE
+	save	%sp, -192, %sp
+	TRAP_LOAD_THREAD_REG(%g6, %g1)
+	LOAD_PER_CPU_BASE(%g5, %g6, %g4, %g3, %o0)
+	ldx	[%g6 + TI_TASK], %g4
 	call	prom_world
-	 mov	%g0, %o0
+	 mov	0, %o0
 	ldx	[%i1 + 0x000], %l2
 	call	%l2
 	 mov	%i0, %o0
 	mov	%o0, %l1
 	call	prom_world
-	 or	%g0, 1, %o0
-	wrpr	%g0, 0x14, %pstate	! interrupts off
-					! restore PROM mmu globals
-	ldx	[%sp + 2047 + 0x158], %o0
-	ldx	[%sp + 2047 + 0x160], %o1
-	ldx	[%sp + 2047 + 0x168], %l2
-	ldx	[%sp + 2047 + 0x170], %l3
-	ldx	[%sp + 2047 + 0x178], %l5
-	ldx	[%sp + 2047 + 0x180], %l6
-	ldx	[%sp + 2047 + 0x188], %l7
-	wrpr	%g0, 0x414, %pstate	! restore PROM mmu globals
-	mov	%o0, %g1
-	mov	%o1, %g2
-	mov	%l2, %g3
-	mov	%l3, %g4
-	mov	%l5, %g5
-	mov	%l6, %g6
-	mov	%l7, %g7
-	wrpr	%l0, %tba		! restore PROM tba
-	wrpr	%g0, 0x14, %pstate	! restore PROM normal globals
-	ldx	[%sp + 2047 + 0x120], %g1
-	ldx	[%sp + 2047 + 0x128], %g2
-	ldx	[%sp + 2047 + 0x130], %g3
-	ldx	[%sp + 2047 + 0x138], %g4
-	ldx	[%sp + 2047 + 0x140], %g5
-	ldx	[%sp + 2047 + 0x148], %g6
-	ldx	[%sp + 2047 + 0x150], %g7
-	wrpr	%g0, 0x814, %pstate	! restore PROM interrupt globals
-	ldx	[%sp + 2047 + 0x0e8], %g1
-	ldx	[%sp + 2047 + 0x0f0], %g2
-	ldx	[%sp + 2047 + 0x0f8], %g3
-	ldx	[%sp + 2047 + 0x100], %g4
-	ldx	[%sp + 2047 + 0x108], %g5
-	ldx	[%sp + 2047 + 0x110], %g6
-	ldx	[%sp + 2047 + 0x118], %g7
-	wrpr	%g0, 0x15, %pstate	! restore PROM alternate globals
-	ldx	[%sp + 2047 + 0x0b0], %g1
-	ldx	[%sp + 2047 + 0x0b8], %g2
-	ldx	[%sp + 2047 + 0x0c0], %g3
-	ldx	[%sp + 2047 + 0x0c8], %g4
-	ldx	[%sp + 2047 + 0x0d0], %g5
-	ldx	[%sp + 2047 + 0x0d8], %g6
-	ldx	[%sp + 2047 + 0x0e0], %g7
-	wrpr	%l4, 0, %pstate
+	 mov	1, %o0
 	ret
 	 restore %l1, 0, %o0
 
diff --git a/arch/sparc64/prom/console.c b/arch/sparc64/prom/console.c
index ac6d035dd150..7c25c54cefdc 100644
--- a/arch/sparc64/prom/console.c
+++ b/arch/sparc64/prom/console.c
@@ -102,6 +102,9 @@ prom_query_input_device(void)
 	if (!strncmp (propb, "rsc", 3))
 		return PROMDEV_IRSC;
 
+	if (!strncmp (propb, "virtual-console", 3))
+		return PROMDEV_IVCONS;
+
 	if (strncmp (propb, "tty", 3) || !propb[3])
 		return PROMDEV_I_UNK;
 
@@ -143,6 +146,9 @@ prom_query_output_device(void)
 	if (!strncmp (propb, "rsc", 3))
 		return PROMDEV_ORSC;
 
+	if (!strncmp (propb, "virtual-console", 3))
+		return PROMDEV_OVCONS;
+
 	if (strncmp (propb, "tty", 3) || !propb[3])
 		return PROMDEV_O_UNK;
 
diff --git a/arch/sparc64/prom/init.c b/arch/sparc64/prom/init.c
index f3cc2d8578b2..1c0db842a6f4 100644
--- a/arch/sparc64/prom/init.c
+++ b/arch/sparc64/prom/init.c
@@ -14,11 +14,10 @@
 #include <asm/openprom.h>
 #include <asm/oplib.h>
 
-enum prom_major_version prom_vers;
-unsigned int prom_rev, prom_prev;
+/* OBP version string. */
+char prom_version[80];
 
 /* The root node of the prom device tree. */
-int prom_root_node;
 int prom_stdin, prom_stdout;
 int prom_chosen_node;
 
@@ -31,68 +30,25 @@ extern void prom_cif_init(void *, void *);
 
 void __init prom_init(void *cif_handler, void *cif_stack)
 {
-	char buffer[80], *p;
-	int ints[3];
 	int node;
-	int i = 0;
-	int bufadjust;
-
-	prom_vers = PROM_P1275;
 
 	prom_cif_init(cif_handler, cif_stack);
 
-	prom_root_node = prom_getsibling(0);
-	if((prom_root_node == 0) || (prom_root_node == -1))
-		prom_halt();
-
 	prom_chosen_node = prom_finddevice(prom_chosen_path);
 	if (!prom_chosen_node || prom_chosen_node == -1)
 		prom_halt();
 
-	prom_stdin = prom_getint (prom_chosen_node, "stdin");
-	prom_stdout = prom_getint (prom_chosen_node, "stdout");
+	prom_stdin = prom_getint(prom_chosen_node, "stdin");
+	prom_stdout = prom_getint(prom_chosen_node, "stdout");
 
 	node = prom_finddevice("/openprom");
 	if (!node || node == -1)
 		prom_halt();
 
-	prom_getstring (node, "version", buffer, sizeof (buffer));
-
-	prom_printf ("\n");
-
-	if (strncmp (buffer, "OBP ", 4))
-		goto strange_version;
-
-	/*
-	 * Version field is expected to be 'OBP xx.yy.zz date...'
-	 * However, Sun can't stick to this format very well, so
-	 * we need to check for 'OBP  xx.yy.zz date...' and adjust
-	 * accordingly. -spot
-	 */
-
-	if (strncmp (buffer, "OBP  ", 5))
-		bufadjust = 4;
-	else
-		bufadjust = 5;
-
-	p = buffer + bufadjust;
-	while (p && isdigit(*p) && i < 3) {
-		ints[i++] = simple_strtoul(p, NULL, 0);
-		if ((p = strchr(p, '.')) != NULL)
-			p++;
-	}
-	if (i != 3)
-		goto strange_version;
-
-	prom_rev = ints[1];
-	prom_prev = (ints[0] << 16) | (ints[1] << 8) | ints[2];
-
-	printk ("PROMLIB: Sun IEEE Boot Prom %s\n", buffer + bufadjust);
+	prom_getstring(node, "version", prom_version, sizeof(prom_version));
 
-	/* Initialization successful. */
-	return;
+	prom_printf("\n");
 
-strange_version:
-	prom_printf ("Strange OBP version `%s'.\n", buffer);
-	prom_halt ();
+	printk("PROMLIB: Sun IEEE Boot Prom '%s'\n", prom_version);
+	printk("PROMLIB: Root node compatible: %s\n", prom_root_compatible);
 }
diff --git a/arch/sparc64/prom/misc.c b/arch/sparc64/prom/misc.c
index 87f5cfce23bb..577bde8b6647 100644
--- a/arch/sparc64/prom/misc.c
+++ b/arch/sparc64/prom/misc.c
@@ -112,28 +112,20 @@ unsigned char prom_get_idprom(char *idbuf, int num_bytes)
 	return 0xff;
 }
 
-/* Get the major prom version number. */
-int prom_version(void)
-{
-	return PROM_P1275;
-}
-
-/* Get the prom plugin-revision. */
-int prom_getrev(void)
-{
-	return prom_rev;
-}
-
-/* Get the prom firmware print revision. */
-int prom_getprev(void)
+/* Install Linux trap table so PROM uses that instead of its own. */
+void prom_set_trap_table(unsigned long tba)
 {
-	return prom_prev;
+	p1275_cmd("SUNW,set-trap-table",
+		  (P1275_ARG(0, P1275_ARG_IN_64B) |
+		   P1275_INOUT(1, 0)), tba);
 }
 
-/* Install Linux trap table so PROM uses that instead of its own. */
-void prom_set_trap_table(unsigned long tba)
+void prom_set_trap_table_sun4v(unsigned long tba, unsigned long mmfsa)
 {
-	p1275_cmd("SUNW,set-trap-table", P1275_INOUT(1, 0), tba);
+	p1275_cmd("SUNW,set-trap-table",
+		  (P1275_ARG(0, P1275_ARG_IN_64B) |
+		   P1275_ARG(1, P1275_ARG_IN_64B) |
+		   P1275_INOUT(2, 0)), tba, mmfsa);
 }
 
 int prom_get_mmu_ihandle(void)
@@ -303,9 +295,21 @@ int prom_wakeupsystem(void)
 }
 
 #ifdef CONFIG_SMP
-void prom_startcpu(int cpunode, unsigned long pc, unsigned long o0)
+void prom_startcpu(int cpunode, unsigned long pc, unsigned long arg)
+{
+	p1275_cmd("SUNW,start-cpu", P1275_INOUT(3, 0), cpunode, pc, arg);
+}
+
+void prom_startcpu_cpuid(int cpuid, unsigned long pc, unsigned long arg)
+{
+	p1275_cmd("SUNW,start-cpu-by-cpuid", P1275_INOUT(3, 0),
+		  cpuid, pc, arg);
+}
+
+void prom_stopcpu_cpuid(int cpuid)
 {
-	p1275_cmd("SUNW,start-cpu", P1275_INOUT(3, 0), cpunode, pc, o0);
+	p1275_cmd("SUNW,stop-cpu-by-cpuid", P1275_INOUT(1, 0),
+		  cpuid);
 }
 
 void prom_stopself(void)
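The prom_set_trap_table() / prom_set_trap_table_sun4v() pair above reflects
the extra state sun4v firmware wants: in addition to the trap base address it
is handed the MMU fault status area (the mmfsa argument).  A hypothetical
call-site sketch, with illustrative variable names, would look like:

	if (tlb_type == hypervisor)
		prom_set_trap_table_sun4v(trap_base, mmu_fault_area_pa);
	else
		prom_set_trap_table(trap_base);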
diff --git a/arch/sparc64/prom/p1275.c b/arch/sparc64/prom/p1275.c
index a5a7c5712028..2b32c489860c 100644
--- a/arch/sparc64/prom/p1275.c
+++ b/arch/sparc64/prom/p1275.c
@@ -30,16 +30,6 @@ extern void prom_world(int);
 extern void prom_cif_interface(void);
 extern void prom_cif_callback(void);
 
-static inline unsigned long spitfire_get_primary_context(void)
-{
-	unsigned long ctx;
-
-	__asm__ __volatile__("ldxa	[%1] %2, %0"
-			     : "=r" (ctx)
-			     : "r" (PRIMARY_CONTEXT), "i" (ASI_DMMU));
-	return ctx;
-}
-
 /*
  * This provides SMP safety on the p1275buf. prom_callback() drops this lock
 * to allow recursive acquisition.
@@ -55,7 +45,6 @@ long p1275_cmd(const char *service, long fmt, ...)
 	long attrs, x;
 	
 	p = p1275buf.prom_buffer;
-	BUG_ON((spitfire_get_primary_context() & CTX_NR_MASK) != 0);
 
 	spin_lock_irqsave(&prom_entry_lock, flags);
 
diff --git a/arch/sparc64/prom/tree.c b/arch/sparc64/prom/tree.c
index b1ff9e87dcc6..49075abd7cbc 100644
--- a/arch/sparc64/prom/tree.c
+++ b/arch/sparc64/prom/tree.c
@@ -51,7 +51,7 @@ prom_getparent(int node)
 __inline__ int
 __prom_getsibling(int node)
 {
-	return p1275_cmd ("peer", P1275_INOUT(1, 1), node);
+	return p1275_cmd(prom_peer_name, P1275_INOUT(1, 1), node);
 }
 
 __inline__ int
@@ -59,9 +59,12 @@ prom_getsibling(int node)
 {
 	int sibnode;
 
-	if(node == -1) return 0;
+	if (node == -1)
+		return 0;
 	sibnode = __prom_getsibling(node);
-	if(sibnode == -1) return 0;
+	if (sibnode == -1)
+		return 0;
+
 	return sibnode;
 }
 
diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c
index 3ab4677395f2..5284996780a7 100644
--- a/arch/sparc64/solaris/misc.c
+++ b/arch/sparc64/solaris/misc.c
@@ -90,7 +90,7 @@ static u32 do_solaris_mmap(u32 addr, u32 len, u32 prot, u32 flags, u32 fd, u64 o
 	len = PAGE_ALIGN(len);
 	if(!(flags & MAP_FIXED))
 		addr = 0;
-	else if (len > 0xf0000000UL || addr > 0xf0000000UL - len)
+	else if (len > STACK_TOP32 || addr > STACK_TOP32 - len)
 		goto out_putf;
 	ret_type = flags & _MAP_NEW;
 	flags &= ~_MAP_NEW;
@@ -102,7 +102,7 @@ static u32 do_solaris_mmap(u32 addr, u32 len, u32 prot, u32 flags, u32 fd, u64 o
 			 (unsigned long) prot, (unsigned long) flags, off);
 	up_write(&current->mm->mmap_sem);
 	if(!ret_type)
-		retval = ((retval < 0xf0000000) ? 0 : retval);
+		retval = ((retval < STACK_TOP32) ? 0 : retval);
 	                        
 out_putf:
 	if (file)