Diffstat (limited to 'arch/powerpc/kvm')
-rw-r--r--  arch/powerpc/kvm/Kconfig                  |  14
-rw-r--r--  arch/powerpc/kvm/book3s.c                 |  76
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c       | 189
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c              | 435
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c      | 100
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c       |  25
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_xics.c      | 238
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 559
-rw-r--r--  arch/powerpc/kvm/book3s_pr_papr.c         |  28
-rw-r--r--  arch/powerpc/kvm/book3s_xics.c            | 105
-rw-r--r--  arch/powerpc/kvm/book3s_xics.h            |  13
-rw-r--r--  arch/powerpc/kvm/powerpc.c                |   3
12 files changed, 1440 insertions(+), 345 deletions(-)
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 11850f310fb4..2963e4dd0b80 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -110,6 +110,20 @@ config KVM_BOOK3S_64_PR
 	  processor, including emulating 32-bit processors on a 64-bit
 	  host.
 
+config KVM_BOOK3S_HV_EXIT_TIMING
+	bool "Detailed timing for hypervisor real-mode code"
+	depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS
+	---help---
+	  Calculate time taken for each vcpu in the real-mode guest entry,
+	  exit, and interrupt handling code, plus time spent in the guest
+	  and in nap mode due to idle (cede) while other threads are still
+	  in the guest.  The total, minimum and maximum times in nanoseconds
+	  together with the number of executions are reported in debugfs in
+	  kvm/vm#/vcpu#/timings.  The overhead is of the order of 30 - 40
+	  ns per exit on POWER8.
+
+	  If unsure, say N.
+
 config KVM_BOOKE_HV
 	bool
 
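
The timings file is plain text, one line per phase in the form "name: count total min max", with times in nanoseconds. As a quick way to inspect it, a minimal user-space reader might look like the sketch below; the debugfs mount point and the vm/vcpu directory names (vm<pid>/vcpu<n>, per the code later in this patch) are assumptions to adjust for your system.

	#include <stdio.h>

	int main(void)
	{
		/* Path layout per the help text: kvm/vm#/vcpu#/timings;
		 * the pid and vcpu number here are made up. */
		const char *path = "/sys/kernel/debug/kvm/vm12345/vcpu0/timings";
		char line[128];
		FILE *f = fopen(path, "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		/* One line per phase: name, count, then total/min/max in ns */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}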
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index cfbcdc654201..453a8a47a467 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -821,6 +821,82 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 #endif
 }
 
+int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+	unsigned long size = kvmppc_get_gpr(vcpu, 4);
+	unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+	u64 buf;
+	int ret;
+
+	if (!is_power_of_2(size) || (size > sizeof(buf)))
+		return H_TOO_HARD;
+
+	ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+	if (ret != 0)
+		return H_TOO_HARD;
+
+	switch (size) {
+	case 1:
+		kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf);
+		break;
+
+	case 2:
+		kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf));
+		break;
+
+	case 4:
+		kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf));
+		break;
+
+	case 8:
+		kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf));
+		break;
+
+	default:
+		BUG();
+	}
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load);
+
+int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+	unsigned long size = kvmppc_get_gpr(vcpu, 4);
+	unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+	unsigned long val = kvmppc_get_gpr(vcpu, 6);
+	u64 buf;
+	int ret;
+
+	switch (size) {
+	case 1:
+		*(u8 *)&buf = val;
+		break;
+
+	case 2:
+		*(__be16 *)&buf = cpu_to_be16(val);
+		break;
+
+	case 4:
+		*(__be32 *)&buf = cpu_to_be32(val);
+		break;
+
+	case 8:
+		*(__be64 *)&buf = cpu_to_be64(val);
+		break;
+
+	default:
+		return H_TOO_HARD;
+	}
+
+	ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+	if (ret != 0)
+		return H_TOO_HARD;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store);
+
 int kvmppc_core_check_processor_compat(void)
 {
 	/*
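
These two handlers implement the H_LOGICAL_CI_LOAD and H_LOGICAL_CI_STORE hypercalls for cache-inhibited (emulated MMIO) access, routed to in-kernel devices on KVM_MMIO_BUS. Below is a hedged sketch of how a pseries guest might invoke the load side, using the standard plpar_hcall() interface from asm/hvcall.h; the wrapper name ci_load32() is hypothetical.

	#include <linux/types.h>
	#include <asm/hvcall.h>

	/* Sketch only: read a 32-bit value from a cache-inhibited
	 * logical address via H_LOGICAL_CI_LOAD. */
	static long ci_load32(unsigned long addr, u32 *val)
	{
		unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
		long rc;

		/* Arguments mirror the handler above: r4 = size (a power
		 * of 2, at most 8 bytes), r5 = logical address. */
		rc = plpar_hcall(H_LOGICAL_CI_LOAD, retbuf, 4UL, addr);
		if (rc == H_SUCCESS)
			*val = retbuf[0];	/* value comes back in r4 */
		return rc;
	}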
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 534acb3c6c3d..d6fe30835c58 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -27,6 +27,7 @@
 #include <linux/srcu.h>
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
+#include <linux/debugfs.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -116,12 +117,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 	long order;
 
 	mutex_lock(&kvm->lock);
-	if (kvm->arch.rma_setup_done) {
-		kvm->arch.rma_setup_done = 0;
-		/* order rma_setup_done vs. vcpus_running */
+	if (kvm->arch.hpte_setup_done) {
+		kvm->arch.hpte_setup_done = 0;
+		/* order hpte_setup_done vs. vcpus_running */
 		smp_mb();
 		if (atomic_read(&kvm->arch.vcpus_running)) {
-			kvm->arch.rma_setup_done = 1;
+			kvm->arch.hpte_setup_done = 1;
 			goto out;
 		}
 	}
@@ -338,9 +339,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 	gr = kvm->arch.revmap[index].guest_rpte;
 
-	/* Unlock the HPTE */
-	asm volatile("lwsync" : : : "memory");
-	hptep[0] = cpu_to_be64(v);
+	unlock_hpte(hptep, v);
 	preempt_enable();
 
 	gpte->eaddr = eaddr;
@@ -469,8 +468,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 	hpte[1] = be64_to_cpu(hptep[1]);
 	hpte[2] = r = rev->guest_rpte;
-	asm volatile("lwsync" : : : "memory");
-	hptep[0] = cpu_to_be64(hpte[0]);
+	unlock_hpte(hptep, hpte[0]);
 	preempt_enable();
 
 	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
@@ -621,7 +619,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	hptep[1] = cpu_to_be64(r);
 	eieio();
-	hptep[0] = cpu_to_be64(hpte[0]);
+	__unlock_hpte(hptep, hpte[0]);
 	asm volatile("ptesync" : : : "memory");
 	preempt_enable();
 	if (page && hpte_is_writable(r))
@@ -642,7 +640,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return ret;
 
  out_unlock:
-	hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 	preempt_enable();
 	goto out_put;
 }
@@ -771,7 +769,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			}
 		}
 		unlock_rmap(rmapp);
-		hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 	}
 	return 0;
 }
@@ -857,7 +855,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			}
 			ret = 1;
 		}
-		hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 	} while ((i = j) != head);
 
 	unlock_rmap(rmapp);
@@ -974,8 +972,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 
 		/* Now check and modify the HPTE */
 		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
-			/* unlock and continue */
-			hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+			__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 			continue;
 		}
 
@@ -996,9 +993,9 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 				npages_dirty = n;
 			eieio();
 		}
-		v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+		v &= ~HPTE_V_ABSENT;
 		v |= HPTE_V_VALID;
-		hptep[0] = cpu_to_be64(v);
+		__unlock_hpte(hptep, v);
 	} while ((i = j) != head);
 
 	unlock_rmap(rmapp);
@@ -1218,8 +1215,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
 			r &= ~HPTE_GR_MODIFIED;
 			revp->guest_rpte = r;
 		}
-		asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-		hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
 		preempt_enable();
 		if (!(valid == want_valid && (first_pass || dirty)))
 			ok = 0;
@@ -1339,20 +1335,20 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 	unsigned long tmp[2];
 	ssize_t nb;
 	long int err, ret;
-	int rma_setup;
+	int hpte_setup;
 
 	if (!access_ok(VERIFY_READ, buf, count))
 		return -EFAULT;
 
 	/* lock out vcpus from running while we're doing this */
 	mutex_lock(&kvm->lock);
-	rma_setup = kvm->arch.rma_setup_done;
-	if (rma_setup) {
-		kvm->arch.rma_setup_done = 0;	/* temporarily */
-		/* order rma_setup_done vs. vcpus_running */
+	hpte_setup = kvm->arch.hpte_setup_done;
+	if (hpte_setup) {
+		kvm->arch.hpte_setup_done = 0;	/* temporarily */
+		/* order hpte_setup_done vs. vcpus_running */
 		smp_mb();
 		if (atomic_read(&kvm->arch.vcpus_running)) {
-			kvm->arch.rma_setup_done = 1;
+			kvm->arch.hpte_setup_done = 1;
 			mutex_unlock(&kvm->lock);
 			return -EBUSY;
 		}
@@ -1405,7 +1401,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 				       "r=%lx\n", ret, i, v, r);
 				goto out;
 			}
-			if (!rma_setup && is_vrma_hpte(v)) {
+			if (!hpte_setup && is_vrma_hpte(v)) {
 				unsigned long psize = hpte_base_page_size(v, r);
 				unsigned long senc = slb_pgsize_encoding(psize);
 				unsigned long lpcr;
@@ -1414,7 +1410,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 					(VRMA_VSID << SLB_VSID_SHIFT_1T);
 				lpcr = senc << (LPCR_VRMASD_SH - 4);
 				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
-				rma_setup = 1;
+				hpte_setup = 1;
 			}
 			++i;
 			hptp += 2;
@@ -1430,9 +1426,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 	}
 
  out:
-	/* Order HPTE updates vs. rma_setup_done */
+	/* Order HPTE updates vs. hpte_setup_done */
 	smp_wmb();
-	kvm->arch.rma_setup_done = rma_setup;
+	kvm->arch.hpte_setup_done = hpte_setup;
 	mutex_unlock(&kvm->lock);
 
 	if (err)
@@ -1495,6 +1491,141 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
 	return ret;
 }
 
+struct debugfs_htab_state {
+	struct kvm	*kvm;
+	struct mutex	mutex;
+	unsigned long	hpt_index;
+	int		chars_left;
+	int		buf_index;
+	char		buf[64];
+};
+
+static int debugfs_htab_open(struct inode *inode, struct file *file)
+{
+	struct kvm *kvm = inode->i_private;
+	struct debugfs_htab_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(kvm);
+	p->kvm = kvm;
+	mutex_init(&p->mutex);
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_htab_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_htab_state *p = file->private_data;
+
+	kvm_put_kvm(p->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	struct debugfs_htab_state *p = file->private_data;
+	ssize_t ret, r;
+	unsigned long i, n;
+	unsigned long v, hr, gr;
+	struct kvm *kvm;
+	__be64 *hptp;
+
+	ret = mutex_lock_interruptible(&p->mutex);
+	if (ret)
+		return ret;
+
+	if (p->chars_left) {
+		n = p->chars_left;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf + p->buf_index, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index += n;
+		buf += n;
+		len -= n;
+		ret = n;
+		if (r) {
+			if (!n)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	kvm = p->kvm;
+	i = p->hpt_index;
+	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+	for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		/* lock the HPTE so it's stable and read it */
+		preempt_disable();
+		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+			cpu_relax();
+		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
+		hr = be64_to_cpu(hptp[1]);
+		gr = kvm->arch.revmap[i].guest_rpte;
+		unlock_hpte(hptp, v);
+		preempt_enable();
+
+		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		n = scnprintf(p->buf, sizeof(p->buf),
+			      "%6lx %.16lx %.16lx %.16lx\n",
+			      i, v, hr, gr);
+		p->chars_left = n;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index = n;
+		buf += n;
+		len -= n;
+		ret += n;
+		if (r) {
+			if (!ret)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+	p->hpt_index = i;
+
+ out:
+	mutex_unlock(&p->mutex);
+	return ret;
+}
+
+ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+			   size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_htab_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_htab_open,
+	.release = debugfs_htab_release,
+	.read	 = debugfs_htab_read,
+	.write	 = debugfs_htab_write,
+	.llseek	 = generic_file_llseek,
+};
+
+void kvmppc_mmu_debugfs_init(struct kvm *kvm)
+{
+	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
+						    kvm->arch.debugfs_dir, kvm,
+						    &debugfs_htab_fops);
+}
+
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
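
The hunks above replace open-coded "lwsync; store" sequences with unlock_hpte() and __unlock_hpte(). Inferring from the code they replace, the two helpers presumably look like the sketch below (the real definitions belong in arch/powerpc/include/asm/kvm_book3s_64.h, added elsewhere in this series): both clear HPTE_V_HVLOCK, and unlock_hpte() adds the release barrier, while __unlock_hpte() is for callers that order the store themselves (e.g. after an eieio).

	static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
	{
		hpte_v &= ~HPTE_V_HVLOCK;
		hpte[0] = cpu_to_be64(hpte_v);
	}

	static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
	{
		hpte_v &= ~HPTE_V_HVLOCK;
		/* release barrier: order prior PTE updates before unlock */
		asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
		hpte[0] = cpu_to_be64(hpte_v);
	}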
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index de747563d29d..48d3c5d2ecc9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -32,6 +32,7 @@
 #include <linux/page-flags.h>
 #include <linux/srcu.h>
 #include <linux/miscdevice.h>
+#include <linux/debugfs.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -50,6 +51,7 @@
 #include <asm/hvcall.h>
 #include <asm/switch_to.h>
 #include <asm/smp.h>
+#include <asm/dbell.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -83,9 +85,35 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static bool kvmppc_ipi_thread(int cpu)
+{
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		preempt_disable();
+		if (cpu_first_thread_sibling(cpu) ==
+		    cpu_first_thread_sibling(smp_processor_id())) {
+			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+			msg |= cpu_thread_in_core(cpu);
+			smp_mb();
+			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+			preempt_enable();
+			return true;
+		}
+		preempt_enable();
+	}
+
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
+	if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
+		xics_wake_cpu(cpu);
+		return true;
+	}
+#endif
+
+	return false;
+}
+
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-	int me;
 	int cpu = vcpu->cpu;
 	wait_queue_head_t *wqp;
 
@@ -95,20 +123,12 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 		++vcpu->stat.halt_wakeup;
 	}
 
-	me = get_cpu();
+	if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+		return;
 
 	/* CPU points to the first thread of the core */
-	if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
-#ifdef CONFIG_PPC_ICP_NATIVE
-		int real_cpu = cpu + vcpu->arch.ptid;
-		if (paca[real_cpu].kvm_hstate.xics_phys)
-			xics_wake_cpu(real_cpu);
-		else
-#endif
-		if (cpu_online(cpu))
-			smp_send_reschedule(cpu);
-	}
-	put_cpu();
+	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
+		smp_send_reschedule(cpu);
 }
 
 /*
@@ -706,6 +726,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
 		/* Send the error out to userspace via KVM_RUN */
 		return rc;
+	case H_LOGICAL_CI_LOAD:
+		ret = kvmppc_h_logical_ci_load(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_LOGICAL_CI_STORE:
+		ret = kvmppc_h_logical_ci_store(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
 	case H_SET_MODE:
 		ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
 					kvmppc_get_gpr(vcpu, 5),
@@ -740,6 +770,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
 	case H_CONFER:
 	case H_REGISTER_VPA:
 	case H_SET_MODE:
+	case H_LOGICAL_CI_LOAD:
+	case H_LOGICAL_CI_STORE:
 #ifdef CONFIG_KVM_XICS
 	case H_XIRR:
 	case H_CPPR:
@@ -1410,6 +1442,154 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	return vcore;
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+static struct debugfs_timings_element {
+	const char *name;
+	size_t offset;
+} timings[] = {
+	{"rm_entry",	offsetof(struct kvm_vcpu, arch.rm_entry)},
+	{"rm_intr",	offsetof(struct kvm_vcpu, arch.rm_intr)},
+	{"rm_exit",	offsetof(struct kvm_vcpu, arch.rm_exit)},
+	{"guest",	offsetof(struct kvm_vcpu, arch.guest_time)},
+	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
+};
+
+#define N_TIMINGS	(sizeof(timings) / sizeof(timings[0]))
+
+struct debugfs_timings_state {
+	struct kvm_vcpu	*vcpu;
+	unsigned int	buflen;
+	char		buf[N_TIMINGS * 100];
+};
+
+static int debugfs_timings_open(struct inode *inode, struct file *file)
+{
+	struct kvm_vcpu *vcpu = inode->i_private;
+	struct debugfs_timings_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(vcpu->kvm);
+	p->vcpu = vcpu;
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_timings_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_timings_state *p = file->private_data;
+
+	kvm_put_kvm(p->vcpu->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
+				    size_t len, loff_t *ppos)
+{
+	struct debugfs_timings_state *p = file->private_data;
+	struct kvm_vcpu *vcpu = p->vcpu;
+	char *s, *buf_end;
+	struct kvmhv_tb_accumulator tb;
+	u64 count;
+	loff_t pos;
+	ssize_t n;
+	int i, loops;
+	bool ok;
+
+	if (!p->buflen) {
+		s = p->buf;
+		buf_end = s + sizeof(p->buf);
+		for (i = 0; i < N_TIMINGS; ++i) {
+			struct kvmhv_tb_accumulator *acc;
+
+			acc = (struct kvmhv_tb_accumulator *)
+				((unsigned long)vcpu + timings[i].offset);
+			ok = false;
+			for (loops = 0; loops < 1000; ++loops) {
+				count = acc->seqcount;
+				if (!(count & 1)) {
+					smp_rmb();
+					tb = *acc;
+					smp_rmb();
+					if (count == acc->seqcount) {
+						ok = true;
+						break;
+					}
+				}
+				udelay(1);
+			}
+			if (!ok)
+				snprintf(s, buf_end - s, "%s: stuck\n",
+					timings[i].name);
+			else
+				snprintf(s, buf_end - s,
+					"%s: %llu %llu %llu %llu\n",
+					timings[i].name, count / 2,
+					tb_to_ns(tb.tb_total),
+					tb_to_ns(tb.tb_min),
+					tb_to_ns(tb.tb_max));
+			s += strlen(s);
+		}
+		p->buflen = s - p->buf;
+	}
+
+	pos = *ppos;
+	if (pos >= p->buflen)
+		return 0;
+	if (len > p->buflen - pos)
+		len = p->buflen - pos;
+	n = copy_to_user(buf, p->buf + pos, len);
+	if (n) {
+		if (n == len)
+			return -EFAULT;
+		len -= n;
+	}
+	*ppos = pos + len;
+	return len;
+}
+
+static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
+				     size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_timings_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_timings_open,
+	.release = debugfs_timings_release,
+	.read	 = debugfs_timings_read,
+	.write	 = debugfs_timings_write,
+	.llseek	 = generic_file_llseek,
+};
+
+/* Create a debugfs directory for the vcpu */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+	char buf[16];
+	struct kvm *kvm = vcpu->kvm;
+
+	snprintf(buf, sizeof(buf), "vcpu%u", id);
+	if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+		return;
+	vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
+	if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
+		return;
+	vcpu->arch.debugfs_timings =
+		debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
+				    vcpu, &debugfs_timings_ops);
+}
+
+#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+}
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+
 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 						   unsigned int id)
 {
@@ -1479,6 +1659,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
 
+	debugfs_vcpu_init(vcpu, id);
+
 	return vcpu;
 
 free_vcpu:
@@ -1566,8 +1748,10 @@ static int kvmppc_grab_hwthread(int cpu)
 	tpaca = &paca[cpu];
 
 	/* Ensure the thread won't go into the kernel if it wakes */
-	tpaca->kvm_hstate.hwthread_req = 1;
 	tpaca->kvm_hstate.kvm_vcpu = NULL;
+	tpaca->kvm_hstate.napping = 0;
+	smp_wmb();
+	tpaca->kvm_hstate.hwthread_req = 1;
 
 	/*
 	 * If the thread is already executing in the kernel (e.g. handling
@@ -1610,35 +1794,41 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 	}
 	cpu = vc->pcpu + vcpu->arch.ptid;
 	tpaca = &paca[cpu];
-	tpaca->kvm_hstate.kvm_vcpu = vcpu;
 	tpaca->kvm_hstate.kvm_vcore = vc;
 	tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
 	vcpu->cpu = vc->pcpu;
+	/* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
 	smp_wmb();
-#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-	if (cpu != smp_processor_id()) {
-		xics_wake_cpu(cpu);
-		if (vcpu->arch.ptid)
-			++vc->n_woken;
-	}
-#endif
+	tpaca->kvm_hstate.kvm_vcpu = vcpu;
+	if (cpu != smp_processor_id())
+		kvmppc_ipi_thread(cpu);
 }
 
-static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+static void kvmppc_wait_for_nap(void)
 {
-	int i;
+	int cpu = smp_processor_id();
+	int i, loops;
 
-	HMT_low();
-	i = 0;
-	while (vc->nap_count < vc->n_woken) {
-		if (++i >= 1000000) {
-			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
-			       vc->nap_count, vc->n_woken);
-			break;
+	for (loops = 0; loops < 1000000; ++loops) {
+		/*
+		 * Check if all threads are finished.
+		 * We set the vcpu pointer when starting a thread
+		 * and the thread clears it when finished, so we look
+		 * for any threads that still have a non-NULL vcpu ptr.
+		 */
+		for (i = 1; i < threads_per_subcore; ++i)
+			if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+				break;
+		if (i == threads_per_subcore) {
+			HMT_medium();
+			return;
 		}
-		cpu_relax();
+		HMT_low();
 	}
 	HMT_medium();
+	for (i = 1; i < threads_per_subcore; ++i)
+		if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
 /*
@@ -1700,63 +1890,103 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
 	mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
 }
 
+static void prepare_threads(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu, *vnext;
+
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		if (signal_pending(vcpu->arch.run_task))
+			vcpu->arch.ret = -EINTR;
+		else if (vcpu->arch.vpa.update_pending ||
+			 vcpu->arch.slb_shadow.update_pending ||
+			 vcpu->arch.dtl.update_pending)
+			vcpu->arch.ret = RESUME_GUEST;
+		else
+			continue;
+		kvmppc_remove_runnable(vc, vcpu);
+		wake_up(&vcpu->arch.cpu_run);
+	}
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc)
+{
+	u64 now;
+	long ret;
+	struct kvm_vcpu *vcpu, *vnext;
+
+	now = get_tb();
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		/* cancel pending dec exception if dec is positive */
+		if (now < vcpu->arch.dec_expires &&
+		    kvmppc_core_pending_dec(vcpu))
+			kvmppc_core_dequeue_dec(vcpu);
+
+		trace_kvm_guest_exit(vcpu);
+
+		ret = RESUME_GUEST;
+		if (vcpu->arch.trap)
+			ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
+						    vcpu->arch.run_task);
+
+		vcpu->arch.ret = ret;
+		vcpu->arch.trap = 0;
+
+		if (vcpu->arch.ceded) {
+			if (!is_kvmppc_resume_guest(ret))
+				kvmppc_end_cede(vcpu);
+			else
+				kvmppc_set_timer(vcpu);
+		}
+		if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+}
+
 /*
  * Run a set of guest threads on a physical core.
  * Called with vc->lock held.
  */
-static void kvmppc_run_core(struct kvmppc_vcore *vc)
+static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vnext;
-	long ret;
-	u64 now;
-	int i, need_vpa_update;
+	struct kvm_vcpu *vcpu;
+	int i;
 	int srcu_idx;
-	struct kvm_vcpu *vcpus_to_update[threads_per_core];
 
-	/* don't start if any threads have a signal pending */
-	need_vpa_update = 0;
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		if (signal_pending(vcpu->arch.run_task))
-			return;
-		if (vcpu->arch.vpa.update_pending ||
-		    vcpu->arch.slb_shadow.update_pending ||
-		    vcpu->arch.dtl.update_pending)
-			vcpus_to_update[need_vpa_update++] = vcpu;
-	}
+	/*
+	 * Remove from the list any threads that have a signal pending
+	 * or need a VPA update done
+	 */
+	prepare_threads(vc);
+
+	/* if the runner is no longer runnable, let the caller pick a new one */
+	if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
 
 	/*
-	 * Initialize *vc, in particular vc->vcore_state, so we can
-	 * drop the vcore lock if necessary.
+	 * Initialize *vc.
 	 */
-	vc->n_woken = 0;
-	vc->nap_count = 0;
-	vc->entry_exit_count = 0;
+	vc->entry_exit_map = 0;
 	vc->preempt_tb = TB_NIL;
-	vc->vcore_state = VCORE_STARTING;
 	vc->in_guest = 0;
 	vc->napping_threads = 0;
 	vc->conferring_threads = 0;
 
 	/*
-	 * Updating any of the vpas requires calling kvmppc_pin_guest_page,
-	 * which can't be called with any spinlocks held.
-	 */
-	if (need_vpa_update) {
-		spin_unlock(&vc->lock);
-		for (i = 0; i < need_vpa_update; ++i)
-			kvmppc_update_vpas(vcpus_to_update[i]);
-		spin_lock(&vc->lock);
-	}
-
-	/*
 	 * Make sure we are running on primary threads, and that secondary
 	 * threads are offline.  Also check if the number of threads in this
 	 * guest are greater than the current system threads per guest.
 	 */
 	if ((threads_per_core > 1) &&
 	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
 			vcpu->arch.ret = -EBUSY;
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
 		goto out;
 	}
 
@@ -1797,8 +2027,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
 		vcpu->cpu = -1;
 	/* wait for secondary threads to finish writing their state to memory */
-	if (vc->nap_count < vc->n_woken)
-		kvmppc_wait_for_nap(vc);
+	kvmppc_wait_for_nap();
 	for (i = 0; i < threads_per_subcore; ++i)
 		kvmppc_release_hwthread(vc->pcpu + i);
 	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
@@ -1812,44 +2041,12 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	kvm_guest_exit();
 
 	preempt_enable();
-	cond_resched();
 
 	spin_lock(&vc->lock);
-	now = get_tb();
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		/* cancel pending dec exception if dec is positive */
-		if (now < vcpu->arch.dec_expires &&
-		    kvmppc_core_pending_dec(vcpu))
-			kvmppc_core_dequeue_dec(vcpu);
-
-		trace_kvm_guest_exit(vcpu);
-
-		ret = RESUME_GUEST;
-		if (vcpu->arch.trap)
-			ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
-						    vcpu->arch.run_task);
-
-		vcpu->arch.ret = ret;
-		vcpu->arch.trap = 0;
-
-		if (vcpu->arch.ceded) {
-			if (!is_kvmppc_resume_guest(ret))
-				kvmppc_end_cede(vcpu);
-			else
-				kvmppc_set_timer(vcpu);
-		}
-	}
+	post_guest_process(vc);
 
  out:
 	vc->vcore_state = VCORE_INACTIVE;
-	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-				 arch.run_list) {
-		if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
-			kvmppc_remove_runnable(vc, vcpu);
-			wake_up(&vcpu->arch.cpu_run);
-		}
-	}
-
 	trace_kvmppc_run_core(vc, 1);
 }
 
@@ -1939,8 +2136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	 * this thread straight away and have it join in.
 	 */
 	if (!signal_pending(current)) {
-		if (vc->vcore_state == VCORE_RUNNING &&
-		    VCORE_EXIT_COUNT(vc) == 0) {
+		if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
 			kvmppc_create_dtl_entry(vcpu, vc);
 			kvmppc_start_thread(vcpu);
 			trace_kvm_guest_enter(vcpu);
@@ -1971,7 +2167,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		}
 		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
 			break;
-		vc->runner = vcpu;
 		n_ceded = 0;
 		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
 			if (!v->arch.pending_exceptions)
@@ -1979,10 +2174,17 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			else
 				v->arch.ceded = 0;
 		}
-		if (n_ceded == vc->n_runnable)
+		vc->runner = vcpu;
+		if (n_ceded == vc->n_runnable) {
 			kvmppc_vcore_blocked(vc);
-		else
+		} else if (should_resched()) {
+			vc->vcore_state = VCORE_PREEMPT;
+			/* Let something else run */
+			cond_resched_lock(&vc->lock);
+			vc->vcore_state = VCORE_INACTIVE;
+		} else {
 			kvmppc_run_core(vc);
+		}
 		vc->runner = NULL;
 	}
 
@@ -2032,11 +2234,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 
 	atomic_inc(&vcpu->kvm->arch.vcpus_running);
-	/* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+	/* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
 	smp_mb();
 
 	/* On the first time here, set up HTAB and VRMA */
-	if (!vcpu->kvm->arch.rma_setup_done) {
+	if (!vcpu->kvm->arch.hpte_setup_done) {
 		r = kvmppc_hv_setup_htab_rma(vcpu);
 		if (r)
 			goto out;
@@ -2238,7 +2440,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	int srcu_idx;
 
 	mutex_lock(&kvm->lock);
-	if (kvm->arch.rma_setup_done)
+	if (kvm->arch.hpte_setup_done)
 		goto out;	/* another vcpu beat us to it */
 
 	/* Allocate hashed page table (if not done already) and reset it */
@@ -2289,9 +2491,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 
 	kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
 
-	/* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+	/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
 	smp_wmb();
-	kvm->arch.rma_setup_done = 1;
+	kvm->arch.hpte_setup_done = 1;
 	err = 0;
  out_srcu:
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
@@ -2307,6 +2509,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
 	unsigned long lpcr, lpid;
+	char buf[32];
 
 	/* Allocate the guest's logical partition ID */
 
@@ -2347,6 +2550,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	 */
 	kvm_hv_vm_activated();
 
+	/*
+	 * Create a debugfs directory for the VM
+	 */
+	snprintf(buf, sizeof(buf), "vm%d", current->pid);
+	kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
+	if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+		kvmppc_mmu_debugfs_init(kvm);
+
 	return 0;
 }
 
@@ -2367,6 +2578,8 @@ static void kvmppc_free_vcores(struct kvm *kvm)
 
 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
+	debugfs_remove_recursive(kvm->arch.debugfs_dir);
+
 	kvm_hv_vm_deactivated();
 
 	kvmppc_free_vcores(kvm);
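
The retry loop in debugfs_timings_read() above is a seqcount-style snapshot: the real-mode accumulation code bumps seqcount to an odd value before touching an accumulator and back to an even value afterwards, so the reader can take a consistent copy without writing any shared state. A minimal sketch of that pairing follows, with the struct fields taken from the output format above (an illustration, not the in-tree definition).

	struct kvmhv_tb_accumulator {
		u64	seqcount;	/* odd while an update is in flight */
		u64	tb_total;	/* aggregate timebase ticks */
		u64	tb_min;		/* shortest interval seen */
		u64	tb_max;		/* longest interval seen */
	};

	/* Returns true if *out holds a consistent snapshot of *acc. */
	static bool snapshot_acc(struct kvmhv_tb_accumulator *acc,
				 struct kvmhv_tb_accumulator *out)
	{
		u64 count = acc->seqcount;

		if (count & 1)
			return false;	/* writer active; caller retries */
		smp_rmb();		/* read the data after seqcount */
		*out = *acc;
		smp_rmb();		/* re-check seqcount after the data */
		return count == acc->seqcount;
	}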
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 1f083ff8a61a..ed2589d4593f 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -21,6 +21,10 @@
 #include <asm/cputable.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
+#include <asm/archrandom.h>
+#include <asm/xics.h>
+#include <asm/dbell.h>
+#include <asm/cputhreads.h>
 
 #define KVM_CMA_CHUNK_ORDER	18
 
@@ -114,11 +118,11 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
 	int rv = H_SUCCESS; /* => don't yield */
 
 	set_bit(vcpu->arch.ptid, &vc->conferring_threads);
-	while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) {
-		threads_running = VCORE_ENTRY_COUNT(vc);
-		threads_ceded = hweight32(vc->napping_threads);
-		threads_conferring = hweight32(vc->conferring_threads);
-		if (threads_ceded + threads_conferring >= threads_running) {
+	while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
+		threads_running = VCORE_ENTRY_MAP(vc);
+		threads_ceded = vc->napping_threads;
+		threads_conferring = vc->conferring_threads;
+		if ((threads_ceded | threads_conferring) == threads_running) {
 			rv = H_TOO_HARD; /* => do yield */
 			break;
 		}
@@ -169,3 +173,89 @@ int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+int kvmppc_hwrng_present(void)
+{
+	return powernv_hwrng_present();
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_h_random(struct kvm_vcpu *vcpu)
+{
+	if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
+		return H_SUCCESS;
+
+	return H_HARDWARE;
+}
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+	__asm__ __volatile__("stbcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+/*
+ * Send an interrupt or message to another CPU.
+ * This can only be called in real mode.
+ * The caller needs to include any barrier needed to order writes
+ * to memory vs. the IPI/message.
+ */
+void kvmhv_rm_send_ipi(int cpu)
+{
+	unsigned long xics_phys;
+
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+	    cpu_first_thread_sibling(cpu) ==
+	    cpu_first_thread_sibling(raw_smp_processor_id())) {
+		unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+		msg |= cpu_thread_in_core(cpu);
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return;
+	}
+
+	/* Else poke the target with an IPI */
+	xics_phys = paca[cpu].kvm_hstate.xics_phys;
+	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+/*
+ * The following functions are called from the assembly code
+ * in book3s_hv_rmhandlers.S.
+ */
+static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
+{
+	int cpu = vc->pcpu;
+
+	/* Order setting of exit map vs. msgsnd/IPI */
+	smp_mb();
+	for (; active; active >>= 1, ++cpu)
+		if (active & 1)
+			kvmhv_rm_send_ipi(cpu);
+}
+
+void kvmhv_commence_exit(int trap)
+{
+	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+	int ptid = local_paca->kvm_hstate.ptid;
+	int me, ee;
+
+	/* Set our bit in the threads-exiting-guest map in the 0xff00
+	   bits of vcore->entry_exit_map */
+	me = 0x100 << ptid;
+	do {
+		ee = vc->entry_exit_map;
+	} while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);
+
+	/* Are we the first here? */
+	if ((ee >> 8) != 0)
+		return;
+
+	/*
+	 * Trigger the other threads in this vcore to exit the guest.
+	 * If this is a hypervisor decrementer interrupt then they
+	 * will be already on their way out of the guest.
+	 */
+	if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
+		kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+}
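
kvmppc_rm_h_confer() and kvmhv_commence_exit() above rely on the new entry_exit_map word that replaces the old entry/exit counters. Judging by the usage (VCORE_ENTRY_MAP, VCORE_IS_EXITING, and the "0x100 << ptid" exit bit), the encoding is presumably two 8-bit thread bitmaps packed into one field, along the lines of this sketch (the real macros live in the kvm_host.h part of this series).

	/* Bits 0-7: threads that have entered the guest.
	 * Bits 8-15: threads that have started exiting. */
	#define VCORE_ENTRY_MAP(vc)	((vc)->entry_exit_map & 0xff)
	#define VCORE_EXIT_MAP(vc)	((vc)->entry_exit_map >> 8)
	#define VCORE_IS_EXITING(vc)	(VCORE_EXIT_MAP(vc) != 0)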
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e4d3b0..f6bf0b1de6d7 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -150,12 +150,6 @@ static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
 	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
 }
 
-static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
-{
-	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-	hpte[0] = cpu_to_be64(hpte_v);
-}
-
 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 		       long pte_index, unsigned long pteh, unsigned long ptel,
 		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
@@ -271,10 +265,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 				u64 pte;
 				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 					cpu_relax();
-				pte = be64_to_cpu(*hpte);
+				pte = be64_to_cpu(hpte[0]);
 				if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
 					break;
-				*hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+				__unlock_hpte(hpte, pte);
 				hpte += 2;
 			}
 			if (i == 8)
@@ -290,9 +284,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 				cpu_relax();
-			pte = be64_to_cpu(*hpte);
+			pte = be64_to_cpu(hpte[0]);
 			if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
-				*hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+				__unlock_hpte(hpte, pte);
 				return H_PTEG_FULL;
 			}
 		}
@@ -331,7 +325,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
 	eieio();
-	hpte[0] = cpu_to_be64(pteh);
+	__unlock_hpte(hpte, pteh);
 	asm volatile("ptesync" : : : "memory");
 
 	*pte_idx_ret = pte_index;
@@ -412,7 +406,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
 	    ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
-		hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hpte, pte);
 		return H_NOT_FOUND;
 	}
 
@@ -548,7 +542,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 				be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
 			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
 			args[j] |= rcbits << (56 - 5);
-			hp[0] = 0;
+			__unlock_hpte(hp, 0);
 		}
 	}
 
@@ -574,7 +568,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	pte = be64_to_cpu(hpte[0]);
 	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
-		hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hpte, pte);
 		return H_NOT_FOUND;
 	}
 
@@ -755,8 +749,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 				/* Return with the HPTE still locked */
 				return (hash << 3) + (i >> 1);
 
-			/* Unlock and move on */
-			hpte[i] = cpu_to_be64(v);
+			__unlock_hpte(&hpte[i], v);
 		}
 
 		if (val & HPTE_V_SECONDARY)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 7c22997de906..00e45b6d4f24 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -23,17 +23,37 @@
 
 #define DEBUG_PASSUP
 
-static inline void rm_writeb(unsigned long paddr, u8 val)
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+/* -- ICS routines -- */
+static void ics_rm_check_resend(struct kvmppc_xics *xics,
+				struct kvmppc_ics *ics, struct kvmppc_icp *icp)
 {
-	__asm__ __volatile__("sync; stbcix %0,0,%1"
-		: : "r" (val), "r" (paddr) : "memory");
+	int i;
+
+	arch_spin_lock(&ics->lock);
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		arch_spin_unlock(&ics->lock);
+		icp_rm_deliver_irq(xics, icp, state->number);
+		arch_spin_lock(&ics->lock);
+	}
+
+	arch_spin_unlock(&ics->lock);
 }
 
+/* -- ICP routines -- */
+
 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 				struct kvm_vcpu *this_vcpu)
 {
 	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
-	unsigned long xics_phys;
 	int cpu;
 
 	/* Mark the target VCPU as having an interrupt pending */
@@ -56,9 +76,8 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 	/* In SMT cpu will always point to thread 0, we adjust it */
 	cpu += vcpu->arch.ptid;
 
-	/* Not too hard, then poke the target */
-	xics_phys = paca[cpu].kvm_hstate.xics_phys;
-	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+	smp_mb();
+	kvmhv_rm_send_ipi(cpu);
 }
 
 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
@@ -116,6 +135,180 @@ static inline int check_too_hard(struct kvmppc_xics *xics,
 	return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
 }
 
+static void icp_rm_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_rm_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	return success;
+}
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		/* Unsafe increment, but this does not need to be accurate */
+		xics->err_noics++;
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	arch_spin_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->server_num) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			/* Unsafe increment again */
+			xics->err_noicp++;
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differentiate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * ics spin lock.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_rm_try_to_deliver() the target
+	 * processor may well have already consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			arch_spin_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt, so we need to set the
+		 * resend map bit and mark the ICS state as needing a resend.
+		 */
+		set_bit(ics->icsid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_rm_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			arch_spin_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	arch_spin_unlock(&ics->lock);
+}
+
 static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			     u8 new_cppr)
 {
@@ -184,8 +377,8 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	 * separately here as well.
 	 */
 	if (resend) {
-		icp->rm_action |= XICS_RM_CHECK_RESEND;
-		icp->rm_resend_icp = icp;
+		icp->n_check_resend++;
+		icp_rm_check_resend(xics, icp);
 	}
 }
 
@@ -300,16 +493,16 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 		}
 	} while (!icp_rm_try_update(icp, old_state, new_state));
 
-	/* Pass rejects to virtual mode */
+	/* Handle reject in real mode */
 	if (reject && reject != XICS_IPI) {
-		this_icp->rm_action |= XICS_RM_REJECT;
-		this_icp->rm_reject = reject;
+		this_icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, reject);
 	}
 
-	/* Pass resends to virtual mode */
+	/* Handle resends in real mode */
 	if (resend) {
-		this_icp->rm_action |= XICS_RM_CHECK_RESEND;
-		this_icp->rm_resend_icp = icp;
+		this_icp->n_check_resend++;
+		icp_rm_check_resend(xics, icp);
 	}
 
 	return check_too_hard(xics, this_icp);
@@ -365,10 +558,13 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 
 	} while (!icp_rm_try_update(icp, old_state, new_state));
 
-	/* Pass rejects to virtual mode */
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_rm_deliver_irq).
+	 */
 	if (reject && reject != XICS_IPI) {
-		icp->rm_action |= XICS_RM_REJECT;
-		icp->rm_reject = reject;
+		icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, reject);
 	}
  bail:
 	return check_too_hard(xics, icp);
@@ -416,10 +612,10 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 		goto bail;
 	state = &ics->irq_state[src];
 
-	/* Still asserted, resend it, we make it look like a reject */
+	/* Still asserted, resend it */
 	if (state->asserted) {
-		icp->rm_action |= XICS_RM_REJECT;
-		icp->rm_reject = irq;
+		icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, irq);
 	}
 
 	if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
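
All of the ICP paths above funnel their state changes through icp_rm_try_update(), a single compare-and-swap over the ICP state packed into one machine word; that is what lets rejects and resends now be handled entirely in real mode. Below is a sketch of the pattern, with an illustrative field layout (the real union kvmppc_icp_state is defined in book3s_xics.h) and a hypothetical helper that is not part of the patch.

	union icp_state_sketch {
		unsigned long raw;
		struct {
			u8	out_ee:1;	/* deliver an external exception */
			u8	need_resend:1;
			u8	cppr;		/* current processor priority */
			u8	mfrr;		/* IPI (MFRR) priority */
			u8	pending_pri;	/* priority of pending xisr */
			u32	xisr;		/* pending interrupt source */
		};
	};

	/* Sketch: atomically set MFRR, retrying until no other thread
	 * raced with the update. */
	static void icp_set_mfrr_sketch(union icp_state_sketch *state, u8 mfrr)
	{
		union icp_state_sketch old_state, new_state;

		do {
			old_state = new_state = READ_ONCE(*state);
			new_state.mfrr = mfrr;	/* delivery checks elided */
		} while (cmpxchg(&state->raw, old_state.raw, new_state.raw)
			 != old_state.raw);
	}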
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 6cbf1630cb70..4d70df26c402 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -172,6 +172,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 kvmppc_primary_no_guest:
 	/* We handle this much like a ceded vcpu */
+	/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+	mfspr	r3, SPRN_HDEC
+	mtspr	SPRN_DEC, r3
+	/*
+	 * Make sure the primary has finished the MMU switch.
+	 * We should never get here on a secondary thread, but
+	 * check it for robustness' sake.
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+65:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	65b
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
 	/* set our bit in napping_threads */
 	ld	r5, HSTATE_KVM_VCORE(r13)
 	lbz	r7, HSTATE_PTID(r13)
@@ -182,7 +198,7 @@ kvmppc_primary_no_guest:
 	or	r3, r3, r0
 	stwcx.	r3, 0, r6
 	bne	1b
-	/* order napping_threads update vs testing entry_exit_count */
+	/* order napping_threads update vs testing entry_exit_map */
 	isync
 	li	r12, 0
 	lwz	r7, VCORE_ENTRY_EXIT(r5)
@@ -191,6 +207,7 @@ kvmppc_primary_no_guest:
 	li	r3, NAPPING_NOVCPU
 	stb	r3, HSTATE_NAPPING(r13)
 
+	li	r3, 0		/* Don't wake on privileged (OS) doorbell */
 	b	kvm_do_nap
 
 kvm_novcpu_wakeup:
@@ -202,7 +219,7 @@ kvm_novcpu_wakeup:
 
 	/* check the wake reason */
 	bl	kvmppc_check_wake_reason
-	
+
 	/* see if any other thread is already exiting */
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
@@ -222,13 +239,37 @@ kvm_novcpu_wakeup:
 	cmpdi	r3, 0
 	bge	kvm_novcpu_exit
 
+	/* See if our timeslice has expired (HDEC is negative) */
+	mfspr	r0, SPRN_HDEC
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+	cmpwi	r0, 0
+	blt	kvm_novcpu_exit
+
 	/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
 	ld	r4, HSTATE_KVM_VCPU(r13)
 	cmpdi	r4, 0
-	bne	kvmppc_got_guest
+	beq	kvmppc_primary_no_guest
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+#endif
+	b	kvmppc_got_guest
 
 kvm_novcpu_exit:
-	b	hdec_soon
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	beq	13f
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+13:	mr	r3, r12
+	stw	r12, 112-4(r1)
+	bl	kvmhv_commence_exit
+	nop
+	lwz	r12, 112-4(r1)
+	b	kvmhv_switch_to_host
 
 /*
  * We come in here when wakened from nap mode.
@@ -239,9 +280,9 @@ kvm_novcpu_exit:
 kvm_start_guest:
 
 	/* Set runlatch bit the minute you wake up from nap */
-	mfspr	r1, SPRN_CTRLF
-	ori 	r1, r1, 1
-	mtspr	SPRN_CTRLT, r1
+	mfspr	r0, SPRN_CTRLF
+	ori 	r0, r0, 1
+	mtspr	SPRN_CTRLT, r0
 
 	ld	r2,PACATOC(r13)
 
@@ -286,26 +327,21 @@ kvm_secondary_got_guest:
 	ld	r6, PACA_DSCR(r13)
 	std	r6, HSTATE_DSCR(r13)
 
+	/* Order load of vcore, ptid etc. after load of vcpu */
+	lwsync
 	bl	kvmppc_hv_entry
 
 	/* Back from the guest, go back to nap */
 	/* Clear our vcpu pointer so we don't come back in early */
 	li	r0, 0
-	std	r0, HSTATE_KVM_VCPU(r13)
 	/*
-	 * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing
-	 * the nap_count, because once the increment to nap_count is
-	 * visible we could be given another vcpu.
+	 * Once we clear HSTATE_KVM_VCPU(r13), the code in
+	 * kvmppc_run_core() is going to assume that all our vcpu
+	 * state is visible in memory.  This lwsync makes sure
+	 * that that is true.
 	 */
 	lwsync
-
-	/* increment the nap count and then go to nap mode */
-	ld	r4, HSTATE_KVM_VCORE(r13)
-	addi	r4, r4, VCORE_NAP_COUNT
-51:	lwarx	r3, 0, r4
-	addi	r3, r3, 1
-	stwcx.	r3, 0, r4
-	bne	51b
+	std	r0, HSTATE_KVM_VCPU(r13)
 
 /*
  * At this point we have finished executing in the guest.
@@ -376,6 +412,14 @@ kvmppc_hv_entry:
 	li	r6, KVM_GUEST_MODE_HOST_HV
 	stb	r6, HSTATE_IN_GUEST(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Store initial timestamp */
+	cmpdi	r4, 0
+	beq	1f
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+1:
+#endif
 	/* Clear out SLB */
 	li	r6,0
 	slbmte	r6,r6
@@ -387,21 +431,23 @@ kvmppc_hv_entry:
 	 * We don't have to lock against concurrent tlbies,
 	 * but we do have to coordinate across hardware threads.
 	 */
-	/* Increment entry count iff exit count is zero. */
-	ld	r5,HSTATE_KVM_VCORE(r13)
-	addi	r9,r5,VCORE_ENTRY_EXIT
-21:	lwarx	r3,0,r9
-	cmpwi	r3,0x100		/* any threads starting to exit? */
+	/* Set bit in entry map iff exit map is zero. */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	li	r7, 1
+	lbz	r6, HSTATE_PTID(r13)
+	sld	r7, r7, r6
+	addi	r9, r5, VCORE_ENTRY_EXIT
+21:	lwarx	r3, 0, r9
+	cmpwi	r3, 0x100		/* any threads starting to exit? */
 	bge	secondary_too_late	/* if so we're too late to the party */
-	addi	r3,r3,1
-	stwcx.	r3,0,r9
+	or	r3, r3, r7
+	stwcx.	r3, 0, r9
 	bne	21b
 
 	/* Primary thread switches to guest partition. */
 	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
-	lbz	r6,HSTATE_PTID(r13)
 	cmpwi	r6,0
-	bne	20f
+	bne	10f
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
 	li	r0,LPID_RSVD		/* switch to reserved LPID */
@@ -472,28 +518,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
-	b	10f
-
-	/* Secondary threads wait for primary to have done partition switch */
-20:	lbz	r0,VCORE_IN_GUEST(r5)
-	cmpwi	r0,0
-	beq	20b
-
-	/* Set LPCR and RMOR. */
-10:	ld	r8,VCORE_LPCR(r5)
-	mtspr	SPRN_LPCR,r8
-	ld	r8,KVM_RMOR(r9)
-	mtspr	SPRN_RMOR,r8
-	isync
-
-	/* Check if HDEC expires soon */
-	mfspr	r3,SPRN_HDEC
-	cmpwi	r3,512		/* 1 microsecond */
-	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-	blt	hdec_soon
 
 	/* Do we have a guest vcpu to run? */
-	cmpdi	r4, 0
+10:	cmpdi	r4, 0
 	beq	kvmppc_primary_no_guest
 kvmppc_got_guest:
 
@@ -818,6 +845,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	clrrdi	r6,r6,1
 	mtspr	SPRN_CTRLT,r6
 4:
+	/* Secondary threads wait for primary to have done partition switch */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r6, HSTATE_PTID(r13)
+	cmpwi	r6, 0
+	beq	21f
+	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	bne	21f
+	HMT_LOW
+20:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	20b
+	HMT_MEDIUM
+21:
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* Check if HDEC expires soon */
+	mfspr	r3, SPRN_HDEC
+	cmpwi	r3, 512		/* 1 microsecond */
+	blt	hdec_soon
+
 	ld	r6, VCPU_CTR(r4)
 	lwz	r7, VCPU_XER(r4)
 
@@ -880,6 +931,12 @@ fast_guest_return:
 	li	r9, KVM_GUEST_MODE_GUEST_HV
 	stb	r9, HSTATE_IN_GUEST(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Accumulate timing */
+	addi	r3, r4, VCPU_TB_GUEST
+	bl	kvmhv_accumulate_time
+#endif
+
 	/* Enter guest */
 
 BEGIN_FTR_SECTION
@@ -917,6 +974,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	hrfid
 	b	.
 
+secondary_too_late:
+	li	r12, 0
+	cmpdi	r4, 0
+	beq	11f
+	stw	r12, VCPU_TRAP(r4)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+11:	b	kvmhv_switch_to_host
+
+hdec_soon:
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+	stw	r12, VCPU_TRAP(r4)
+	mr	r9, r4
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+	b	guest_exit_cont
+
 /******************************************************************************
  *                                                                            *
  *                               Exit code                                    *
@@ -1002,6 +1080,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 	stw	r12,VCPU_TRAP(r9)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r9, VCPU_TB_RMINTR
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+	ld	r5, VCPU_GPR(R5)(r9)
+	ld	r6, VCPU_GPR(R6)(r9)
+	ld	r7, VCPU_GPR(R7)(r9)
+	ld	r8, VCPU_GPR(R8)(r9)
+#endif
+
 	/* Save HEIR (HV emulation assist reg) in emul_inst
 	   if this is an HEI (HV emulation interrupt, e40) */
 	li	r3,KVM_INST_FETCH_FAILED
@@ -1028,34 +1116,37 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	bne	2f
 	mfspr	r3,SPRN_HDEC
 	cmpwi	r3,0
-	bge	ignore_hdec
+	mr	r4,r9
+	bge	fast_guest_return
 2:
 	/* See if this is an hcall we can handle in real mode */
 	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
 	beq	hcall_try_real_mode
 
+	/* Hypervisor doorbell - exit only if host IPI flag set */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	bne	3f
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	beq	4f
+	b	guest_exit_cont
+3:
 	/* External interrupt ? */
 	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
-	bne+	ext_interrupt_to_host
+	bne+	guest_exit_cont
 
 	/* External interrupt, first check for host_ipi. If this is
 	 * set, we know the host wants us out so let's do it now
 	 */
 	bl	kvmppc_read_intr
 	cmpdi	r3, 0
-	bgt	ext_interrupt_to_host
+	bgt	guest_exit_cont
 
 	/* Check if any CPU is heading out to the host, if so head out too */
-	ld	r5, HSTATE_KVM_VCORE(r13)
+4:	ld	r5, HSTATE_KVM_VCORE(r13)
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
-	bge	ext_interrupt_to_host
-
-	/* Return to guest after delivering any pending interrupt */
 	mr	r4, r9
-	b	deliver_guest_interrupt
-
-ext_interrupt_to_host:
+	blt	deliver_guest_interrupt
 
 guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	/* Save more register state  */
@@ -1065,7 +1156,7 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	stw	r7, VCPU_DSISR(r9)
 	/* don't overwrite fault_dar/fault_dsisr if HDSI */
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-	beq	6f
+	beq	mc_cont
 	std	r6, VCPU_FAULT_DAR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 
@@ -1073,9 +1164,20 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	beq	machine_check_realmode
 mc_cont:
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r9, VCPU_TB_RMEXIT
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+#endif
+
+	/* Increment exit count, poke other threads to exit */
+	bl	kvmhv_commence_exit
+	nop
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	lwz	r12, VCPU_TRAP(r9)
 
 	/* Save guest CTRL register, set runlatch to 1 */
-6:	mfspr	r6,SPRN_CTRLF
+	mfspr	r6,SPRN_CTRLF
 	stw	r6,VCPU_CTRL(r9)
 	andi.	r0,r6,1
 	bne	4f
@@ -1417,68 +1519,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	slbia
 	ptesync
 
-hdec_soon:			/* r12 = trap, r13 = paca */
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
 	 * We don't have to lock against tlbies but we do
 	 * have to coordinate the hardware threads.
 	 */
-	/* Increment the threads-exiting-guest count in the 0xff00
-	   bits of vcore->entry_exit_count */
-	ld	r5,HSTATE_KVM_VCORE(r13)
-	addi	r6,r5,VCORE_ENTRY_EXIT
-41:	lwarx	r3,0,r6
-	addi	r0,r3,0x100
-	stwcx.	r0,0,r6
-	bne	41b
-	isync		/* order stwcx. vs. reading napping_threads */
-
-	/*
-	 * At this point we have an interrupt that we have to pass
-	 * up to the kernel or qemu; we can't handle it in real mode.
-	 * Thus we have to do a partition switch, so we have to
-	 * collect the other threads, if we are the first thread
-	 * to take an interrupt.  To do this, we set the HDEC to 0,
-	 * which causes an HDEC interrupt in all threads within 2ns
-	 * because the HDEC register is shared between all 4 threads.
-	 * However, we don't need to bother if this is an HDEC
-	 * interrupt, since the other threads will already be on their
-	 * way here in that case.
-	 */
-	cmpwi	r3,0x100	/* Are we the first here? */
-	bge	43f
-	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-	beq	40f
-	li	r0,0
-	mtspr	SPRN_HDEC,r0
-40:
-	/*
-	 * Send an IPI to any napping threads, since an HDEC interrupt
-	 * doesn't wake CPUs up from nap.
-	 */
-	lwz	r3,VCORE_NAPPING_THREADS(r5)
-	lbz	r4,HSTATE_PTID(r13)
-	li	r0,1
-	sld	r0,r0,r4
-	andc.	r3,r3,r0		/* no sense IPI'ing ourselves */
-	beq	43f
-	/* Order entry/exit update vs. IPIs */
-	sync
-	mulli	r4,r4,PACA_SIZE		/* get paca for thread 0 */
-	subf	r6,r4,r13
-42:	andi.	r0,r3,1
-	beq	44f
-	ld	r8,HSTATE_XICS_PHYS(r6)	/* get thread's XICS reg addr */
-	li	r0,IPI_PRIORITY
-	li	r7,XICS_MFRR
-	stbcix	r0,r7,r8		/* trigger the IPI */
-44:	srdi.	r3,r3,1
-	addi	r6,r6,PACA_SIZE
-	bne	42b
-
-secondary_too_late:
+kvmhv_switch_to_host:
 	/* Secondary threads wait for primary to do partition switch */
-43:	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r5,HSTATE_KVM_VCORE(r13)
 	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
 	lbz	r3,HSTATE_PTID(r13)
 	cmpwi	r3,0
@@ -1562,6 +1610,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 1:	addi	r8,r8,16
 	.endr
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Finish timing, if we have a vcpu */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	li	r3, 0
+	beq	2f
+	bl	kvmhv_accumulate_time
+2:
+#endif
 	/* Unset guest mode */
 	li	r0, KVM_GUEST_MODE_NONE
 	stb	r0, HSTATE_IN_GUEST(r13)
@@ -1696,8 +1753,10 @@ kvmppc_hisi:
  * Returns to the guest if we handle it, or continues on up to
  * the kernel if we can't (i.e. if we don't have a handler for
  * it, or if the handler returns H_TOO_HARD).
+ *
+ * r5 - r8 contain hcall args,
+ * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca
  */
-	.globl	hcall_try_real_mode
 hcall_try_real_mode:
 	ld	r3,VCPU_GPR(R3)(r9)
 	andi.	r0,r11,MSR_PR
@@ -1839,13 +1898,124 @@ hcall_real_table:
 	.long	0		/* 0x12c */
 	.long	0		/* 0x130 */
 	.long	DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
+	.long	0		/* 0x138 */
+	.long	0		/* 0x13c */
+	.long	0		/* 0x140 */
+	.long	0		/* 0x144 */
+	.long	0		/* 0x148 */
+	.long	0		/* 0x14c */
+	.long	0		/* 0x150 */
+	.long	0		/* 0x154 */
+	.long	0		/* 0x158 */
+	.long	0		/* 0x15c */
+	.long	0		/* 0x160 */
+	.long	0		/* 0x164 */
+	.long	0		/* 0x168 */
+	.long	0		/* 0x16c */
+	.long	0		/* 0x170 */
+	.long	0		/* 0x174 */
+	.long	0		/* 0x178 */
+	.long	0		/* 0x17c */
+	.long	0		/* 0x180 */
+	.long	0		/* 0x184 */
+	.long	0		/* 0x188 */
+	.long	0		/* 0x18c */
+	.long	0		/* 0x190 */
+	.long	0		/* 0x194 */
+	.long	0		/* 0x198 */
+	.long	0		/* 0x19c */
+	.long	0		/* 0x1a0 */
+	.long	0		/* 0x1a4 */
+	.long	0		/* 0x1a8 */
+	.long	0		/* 0x1ac */
+	.long	0		/* 0x1b0 */
+	.long	0		/* 0x1b4 */
+	.long	0		/* 0x1b8 */
+	.long	0		/* 0x1bc */
+	.long	0		/* 0x1c0 */
+	.long	0		/* 0x1c4 */
+	.long	0		/* 0x1c8 */
+	.long	0		/* 0x1cc */
+	.long	0		/* 0x1d0 */
+	.long	0		/* 0x1d4 */
+	.long	0		/* 0x1d8 */
+	.long	0		/* 0x1dc */
+	.long	0		/* 0x1e0 */
+	.long	0		/* 0x1e4 */
+	.long	0		/* 0x1e8 */
+	.long	0		/* 0x1ec */
+	.long	0		/* 0x1f0 */
+	.long	0		/* 0x1f4 */
+	.long	0		/* 0x1f8 */
+	.long	0		/* 0x1fc */
+	.long	0		/* 0x200 */
+	.long	0		/* 0x204 */
+	.long	0		/* 0x208 */
+	.long	0		/* 0x20c */
+	.long	0		/* 0x210 */
+	.long	0		/* 0x214 */
+	.long	0		/* 0x218 */
+	.long	0		/* 0x21c */
+	.long	0		/* 0x220 */
+	.long	0		/* 0x224 */
+	.long	0		/* 0x228 */
+	.long	0		/* 0x22c */
+	.long	0		/* 0x230 */
+	.long	0		/* 0x234 */
+	.long	0		/* 0x238 */
+	.long	0		/* 0x23c */
+	.long	0		/* 0x240 */
+	.long	0		/* 0x244 */
+	.long	0		/* 0x248 */
+	.long	0		/* 0x24c */
+	.long	0		/* 0x250 */
+	.long	0		/* 0x254 */
+	.long	0		/* 0x258 */
+	.long	0		/* 0x25c */
+	.long	0		/* 0x260 */
+	.long	0		/* 0x264 */
+	.long	0		/* 0x268 */
+	.long	0		/* 0x26c */
+	.long	0		/* 0x270 */
+	.long	0		/* 0x274 */
+	.long	0		/* 0x278 */
+	.long	0		/* 0x27c */
+	.long	0		/* 0x280 */
+	.long	0		/* 0x284 */
+	.long	0		/* 0x288 */
+	.long	0		/* 0x28c */
+	.long	0		/* 0x290 */
+	.long	0		/* 0x294 */
+	.long	0		/* 0x298 */
+	.long	0		/* 0x29c */
+	.long	0		/* 0x2a0 */
+	.long	0		/* 0x2a4 */
+	.long	0		/* 0x2a8 */
+	.long	0		/* 0x2ac */
+	.long	0		/* 0x2b0 */
+	.long	0		/* 0x2b4 */
+	.long	0		/* 0x2b8 */
+	.long	0		/* 0x2bc */
+	.long	0		/* 0x2c0 */
+	.long	0		/* 0x2c4 */
+	.long	0		/* 0x2c8 */
+	.long	0		/* 0x2cc */
+	.long	0		/* 0x2d0 */
+	.long	0		/* 0x2d4 */
+	.long	0		/* 0x2d8 */
+	.long	0		/* 0x2dc */
+	.long	0		/* 0x2e0 */
+	.long	0		/* 0x2e4 */
+	.long	0		/* 0x2e8 */
+	.long	0		/* 0x2ec */
+	.long	0		/* 0x2f0 */
+	.long	0		/* 0x2f4 */
+	.long	0		/* 0x2f8 */
+	.long	0		/* 0x2fc */
+	.long	DOTSYM(kvmppc_h_random) - hcall_real_table	/* 0x300 */
 	.globl	hcall_real_table_end
 hcall_real_table_end:
 
-ignore_hdec:
-	mr	r4,r9
-	b	fast_guest_return
-
 _GLOBAL(kvmppc_h_set_xdabr)
 	andi.	r0, r5, DABRX_USER | DABRX_KERNEL
 	beq	6f
@@ -1884,7 +2054,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	li	r3, 0
 	blr
 
-_GLOBAL(kvmppc_h_cede)
+_GLOBAL(kvmppc_h_cede)		/* r3 = vcpu pointer, r11 = msr, r13 = paca */
 	ori	r11,r11,MSR_EE
 	std	r11,VCPU_MSR(r3)
 	li	r0,1
@@ -1893,8 +2063,8 @@ _GLOBAL(kvmppc_h_cede)
 	lbz	r5,VCPU_PRODDED(r3)
 	cmpwi	r5,0
 	bne	kvm_cede_prodded
-	li	r0,0		/* set trap to 0 to say hcall is handled */
-	stw	r0,VCPU_TRAP(r3)
+	li	r12,0		/* set trap to 0 to say hcall is handled */
+	stw	r12,VCPU_TRAP(r3)
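+	/* (r12 also carries the trap value into guest_exit_cont) */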
 	li	r0,H_SUCCESS
 	std	r0,VCPU_GPR(R3)(r3)
 
@@ -1912,12 +2082,11 @@ _GLOBAL(kvmppc_h_cede)
 	addi	r6,r5,VCORE_NAPPING_THREADS
 31:	lwarx	r4,0,r6
 	or	r4,r4,r0
-	PPC_POPCNTW(R7,R4)
-	cmpw	r7,r8
-	bge	kvm_cede_exit
+	cmpw	r4,r8
+	beq	kvm_cede_exit
 	stwcx.	r4,0,r6
 	bne	31b
-	/* order napping_threads update vs testing entry_exit_count */
+	/* order napping_threads update vs testing entry_exit_map */
 	isync
 	li	r0,NAPPING_CEDE
 	stb	r0,HSTATE_NAPPING(r13)
@@ -1955,21 +2124,52 @@ _GLOBAL(kvmppc_h_cede)
 	bl	kvmppc_save_fp
 
 	/*
+	 * Set DEC to the smaller of DEC and HDEC, so that we wake
+	 * no later than the end of our timeslice (HDEC interrupts
+	 * don't wake us from nap).
+	 */
+	mfspr	r3, SPRN_DEC
+	mfspr	r4, SPRN_HDEC
+	mftb	r5
+	cmpw	r3, r4
+	ble	67f
+	mtspr	SPRN_DEC, r4
+67:
+	/* save expiry time of guest decrementer */
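+	/* (DEC is a signed 32-bit quantity, hence the sign extension) */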
+	extsw	r3, r3
+	add	r3, r3, r5
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r6, VCORE_TB_OFFSET(r5)
+	subf	r3, r6, r3	/* convert to host TB value */
+	std	r3, VCPU_DEC_EXPIRES(r4)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	addi	r3, r4, VCPU_TB_CEDE
+	bl	kvmhv_accumulate_time
+#endif
+
+	lis	r3, LPCR_PECEDP@h	/* Do wake on privileged doorbell */
+
+	/*
 	 * Take a nap until a decrementer or external or doorbell interrupt
-	 * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the
-	 * runlatch bit before napping.
+	 * occurs, with PECE1 and PECE0 set in LPCR.
+	 * On POWER8, set PECEDH, and if we are ceding, also set PECEDP.
+	 * Also clear the runlatch bit before napping.
 	 */
 kvm_do_nap:
-	mfspr	r2, SPRN_CTRLF
-	clrrdi	r2, r2, 1
-	mtspr	SPRN_CTRLT, r2
+	mfspr	r0, SPRN_CTRLF
+	clrrdi	r0, r0, 1
+	mtspr	SPRN_CTRLT, r0
 
 	li	r0,1
 	stb	r0,HSTATE_HWTHREAD_REQ(r13)
 	mfspr	r5,SPRN_LPCR
 	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1
 BEGIN_FTR_SECTION
-	oris	r5,r5,LPCR_PECEDP@h
+	ori	r5, r5, LPCR_PECEDH
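+	/* r3 selects PECEDP: set on the cede path, clear otherwise */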
+	rlwimi	r5, r3, 0, LPCR_PECEDP
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_LPCR,r5
 	isync
@@ -1994,9 +2194,23 @@ kvm_end_cede:
 	/* Woken by external or decrementer interrupt */
 	ld	r1, HSTATE_HOST_R1(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMINTR
+	bl	kvmhv_accumulate_time
+#endif
+
 	/* load up FP state */
 	bl	kvmppc_load_fp
 
+	/* Restore guest decrementer */
+	ld	r3, VCPU_DEC_EXPIRES(r4)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r6, VCORE_TB_OFFSET(r5)
+	add	r3, r3, r6	/* convert host TB to guest TB value */
+	mftb	r7
+	subf	r3, r7, r3
+	mtspr	SPRN_DEC, r3
+
 	/* Load NV GPRS */
 	ld	r14, VCPU_GPR(R14)(r4)
 	ld	r15, VCPU_GPR(R15)(r4)
@@ -2057,7 +2271,8 @@ kvm_cede_prodded:
 
 	/* we've ceded but we want to give control to the host */
 kvm_cede_exit:
-	b	hcall_real_fallback
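+	/* guest_exit_cont expects the vcpu pointer in r9 */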
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	b	guest_exit_cont
 
 	/* Try to handle a machine check in real mode */
 machine_check_realmode:
@@ -2089,13 +2304,14 @@ machine_check_realmode:
 
 /*
  * Check the reason we woke from nap, and take appropriate action.
- * Returns:
+ * Returns (in r3):
  *	0 if nothing needs to be done
  *	1 if something happened that needs to be handled by the host
- *	-1 if there was a guest wakeup (IPI)
+ *	-1 if there was a guest wakeup (IPI or msgsnd)
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
+ * Modifies r0, r6, r7, r8.
  */
 kvmppc_check_wake_reason:
 	mfspr	r6, SPRN_SRR1
@@ -2122,7 +2338,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 	/* hypervisor doorbell */
 3:	li	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	/* see if it's a host IPI */
 	li	r3, 1
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	bnelr
+	/* if not, clear it and return -1 */
+	lis	r6, (PPC_DBELL_SERVER << (63-36))@h
+	PPC_MSGCLR(6)
+	li	r3, -1
 	blr
 
 /*
@@ -2131,6 +2355,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  *	0 if no interrupt is pending
  *	1 if an interrupt is pending that needs to be handled by the host
  *	-1 if there was a guest wakeup IPI (which has now been cleared)
+ * Modifies r0, r6, r7, r8, returns value in r3.
  */
 kvmppc_read_intr:
 	/* see if a host IPI is pending */
@@ -2185,6 +2410,7 @@ kvmppc_read_intr:
 	bne-	43f
 
 	/* OK, it's an IPI for us */
+	li	r12, 0
 	li	r3, -1
 1:	blr
 
@@ -2314,3 +2540,62 @@ kvmppc_fix_pmao:
 	mtspr	SPRN_PMC6, r3
 	isync
 	blr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+/*
+ * Start timing an activity
+ * r3 = pointer to time accumulation struct, r4 = vcpu
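+ * Modifies r5, r6.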
+ */
+kvmhv_start_timing:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r6, VCORE_IN_GUEST(r5)
+	cmpwi	r6, 0
+	beq	5f				/* if in guest, need to */
+	ld	r6, VCORE_TB_OFFSET(r5)		/* subtract timebase offset */
+5:	mftb	r5
+	subf	r5, r6, r5
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	std	r5, VCPU_ACTIVITY_START(r4)
+	blr
+
+/*
+ * Accumulate time to one activity and start another.
+ * r3 = pointer to new time accumulation struct, r4 = vcpu
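+ * Modifies r3, r5 - r8.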
+ */
+kvmhv_accumulate_time:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r8, VCORE_IN_GUEST(r5)
+	cmpwi	r8, 0
+	beq	4f				/* if in guest, need to */
+	ld	r8, VCORE_TB_OFFSET(r5)		/* subtract timebase offset */
+4:	ld	r5, VCPU_CUR_ACTIVITY(r4)
+	ld	r6, VCPU_ACTIVITY_START(r4)
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	mftb	r7
+	subf	r7, r8, r7
+	std	r7, VCPU_ACTIVITY_START(r4)
+	cmpdi	r5, 0
+	beqlr
+	subf	r3, r6, r7
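+	/*
+	 * Write side of a simple seqlock: bump the seqcount to an odd
+	 * value before updating the stats and to an even value after,
+	 * with lwsync barriers in between, so a debugfs reader can
+	 * detect a torn read and retry.
+	 */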
+	ld	r8, TAS_SEQCOUNT(r5)
+	cmpdi	r8, 0
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)
+	lwsync
+	ld	r7, TAS_TOTAL(r5)
+	add	r7, r7, r3
+	std	r7, TAS_TOTAL(r5)
+	ld	r6, TAS_MIN(r5)
+	ld	r7, TAS_MAX(r5)
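+	/* CR0 still holds the first-sample test from the cmpdi above */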
+	beq	3f
+	cmpd	r3, r6
+	bge	1f
+3:	std	r3, TAS_MIN(r5)
+1:	cmpd	r3, r7
+	ble	2f
+	std	r3, TAS_MAX(r5)
+2:	lwsync
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)
+	blr
+#endif
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index ce3c893d509b..f2c75a1e0536 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -258,6 +258,28 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+	long rc;
+
+	rc = kvmppc_h_logical_ci_load(vcpu);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+	long rc;
+
+	rc = kvmppc_h_logical_ci_store(vcpu);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 {
 	long rc = kvmppc_xics_hcall(vcpu, cmd);
@@ -290,6 +312,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
+	case H_LOGICAL_CI_LOAD:
+		return kvmppc_h_pr_logical_ci_load(vcpu);
+	case H_LOGICAL_CI_STORE:
+		return kvmppc_h_pr_logical_ci_store(vcpu);
 	case H_XIRR:
 	case H_CPPR:
 	case H_EOI:
@@ -323,6 +349,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
 	case H_BULK_REMOVE:
 	case H_PUT_TCE:
 	case H_CEDE:
+	case H_LOGICAL_CI_LOAD:
+	case H_LOGICAL_CI_STORE:
 #ifdef CONFIG_KVM_XICS
 	case H_XIRR:
 	case H_CPPR:
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index a4a8d9f0dcb7..8f3e6cc54d95 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -20,6 +20,7 @@
 #include <asm/xics.h>
 #include <asm/debug.h>
 #include <asm/time.h>
+#include <asm/spinlock.h>
 
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -39,7 +40,7 @@
  * LOCKING
  * =======
  *
- * Each ICS has a mutex protecting the information about the IRQ
+ * Each ICS has a spin lock protecting the information about the IRQ
  * sources and avoiding simultaneous deliveries of the same interrupt.
  *
  * ICP operations are done via a single compare & swap transaction
@@ -109,7 +110,10 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 {
 	int i;
 
-	mutex_lock(&ics->lock);
+	unsigned long flags;
+
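+	/*
+	 * The ICS lock is a raw arch_spinlock_t because it is also
+	 * taken by the real-mode XICS code; interrupts must stay
+	 * disabled while it is held.
+	 */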
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 
 	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
 		struct ics_irq_state *state = &ics->irq_state[i];
@@ -120,12 +124,15 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 		XICS_DBG("resend %#x prio %#x\n", state->number,
 			      state->priority);
 
-		mutex_unlock(&ics->lock);
+		arch_spin_unlock(&ics->lock);
+		local_irq_restore(flags);
 		icp_deliver_irq(xics, icp, state->number);
-		mutex_lock(&ics->lock);
+		local_irq_save(flags);
+		arch_spin_lock(&ics->lock);
 	}
 
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 }
 
 static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -133,8 +140,10 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 		       u32 server, u32 priority, u32 saved_priority)
 {
 	bool deliver;
+	unsigned long flags;
 
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 
 	state->server = server;
 	state->priority = priority;
@@ -145,7 +154,8 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 		deliver = true;
 	}
 
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 
 	return deliver;
 }
@@ -186,6 +196,7 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
 	struct kvmppc_ics *ics;
 	struct ics_irq_state *state;
 	u16 src;
+	unsigned long flags;
 
 	if (!xics)
 		return -ENODEV;
@@ -195,10 +206,12 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
 		return -EINVAL;
 	state = &ics->irq_state[src];
 
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 	*server = state->server;
 	*priority = state->priority;
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 
 	return 0;
 }
@@ -365,6 +378,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	struct kvmppc_ics *ics;
 	u32 reject;
 	u16 src;
+	unsigned long flags;
 
 	/*
 	 * This is used both for initial delivery of an interrupt and
@@ -391,7 +405,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	state = &ics->irq_state[src];
 
 	/* Get a lock on the ICS */
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 
 	/* Get our server */
 	if (!icp || state->server != icp->server_num) {
@@ -434,7 +449,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	 *
 	 * Note that if successful, the new delivery might have itself
 	 * rejected an interrupt that was "delivered" before we took the
-	 * icp mutex.
+	 * ics spin lock.
 	 *
 	 * In this case we do the whole sequence all over again for the
 	 * new guy. We cannot assume that the rejected interrupt is less
@@ -448,7 +463,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 * Delivery was successful, did we reject somebody else ?
 		 */
 		if (reject && reject != XICS_IPI) {
-			mutex_unlock(&ics->lock);
+			arch_spin_unlock(&ics->lock);
+			local_irq_restore(flags);
 			new_irq = reject;
 			goto again;
 		}
@@ -468,12 +484,14 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 */
 		smp_mb();
 		if (!icp->state.need_resend) {
-			mutex_unlock(&ics->lock);
+			arch_spin_unlock(&ics->lock);
+			local_irq_restore(flags);
 			goto again;
 		}
 	}
  out:
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 }
 
 static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
@@ -802,14 +820,22 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 	XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
 		 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
 
-	if (icp->rm_action & XICS_RM_KICK_VCPU)
+	if (icp->rm_action & XICS_RM_KICK_VCPU) {
+		icp->n_rm_kick_vcpu++;
 		kvmppc_fast_vcpu_kick(icp->rm_kick_target);
-	if (icp->rm_action & XICS_RM_CHECK_RESEND)
+	}
+	if (icp->rm_action & XICS_RM_CHECK_RESEND) {
+		icp->n_rm_check_resend++;
 		icp_check_resend(xics, icp->rm_resend_icp);
-	if (icp->rm_action & XICS_RM_REJECT)
+	}
+	if (icp->rm_action & XICS_RM_REJECT) {
+		icp->n_rm_reject++;
 		icp_deliver_irq(xics, icp, icp->rm_reject);
-	if (icp->rm_action & XICS_RM_NOTIFY_EOI)
+	}
+	if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
+		icp->n_rm_notify_eoi++;
 		kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
+	}
 
 	icp->rm_action = 0;
 
@@ -872,10 +898,21 @@ static int xics_debug_show(struct seq_file *m, void *private)
 	struct kvm *kvm = xics->kvm;
 	struct kvm_vcpu *vcpu;
 	int icsid, i;
+	unsigned long flags;
+	unsigned long t_rm_kick_vcpu, t_rm_check_resend;
+	unsigned long t_rm_reject, t_rm_notify_eoi;
+	unsigned long t_reject, t_check_resend;
 
 	if (!kvm)
 		return 0;
 
+	t_rm_kick_vcpu = 0;
+	t_rm_notify_eoi = 0;
+	t_rm_check_resend = 0;
+	t_rm_reject = 0;
+	t_check_resend = 0;
+	t_reject = 0;
+
 	seq_printf(m, "=========\nICP state\n=========\n");
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -890,8 +927,19 @@ static int xics_debug_show(struct seq_file *m, void *private)
 			   icp->server_num, state.xisr,
 			   state.pending_pri, state.cppr, state.mfrr,
 			   state.out_ee, state.need_resend);
+		t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
+		t_rm_notify_eoi += icp->n_rm_notify_eoi;
+		t_rm_check_resend += icp->n_rm_check_resend;
+		t_rm_reject += icp->n_rm_reject;
+		t_check_resend += icp->n_check_resend;
+		t_reject += icp->n_reject;
 	}
 
+	seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+			t_rm_kick_vcpu, t_rm_check_resend,
+			t_rm_reject, t_rm_notify_eoi);
+	seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
+			t_check_resend, t_reject);
 	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
 		struct kvmppc_ics *ics = xics->ics[icsid];
 
@@ -901,7 +949,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
 		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
 			   icsid);
 
-		mutex_lock(&ics->lock);
+		local_irq_save(flags);
+		arch_spin_lock(&ics->lock);
 
 		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
 			struct ics_irq_state *irq = &ics->irq_state[i];
@@ -912,7 +961,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
 				   irq->resend, irq->masked_pending);
 
 		}
-		mutex_unlock(&ics->lock);
+		arch_spin_unlock(&ics->lock);
+		local_irq_restore(flags);
 	}
 	return 0;
 }
@@ -965,7 +1015,6 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
 	if (!ics)
 		goto out;
 
-	mutex_init(&ics->lock);
 	ics->icsid = icsid;
 
 	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
@@ -1107,13 +1156,15 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	u64 __user *ubufp = (u64 __user *) addr;
 	u16 idx;
 	u64 val, prio;
+	unsigned long flags;
 
 	ics = kvmppc_xics_find_ics(xics, irq, &idx);
 	if (!ics)
 		return -ENOENT;
 
 	irqp = &ics->irq_state[idx];
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 	ret = -ENOENT;
 	if (irqp->exists) {
 		val = irqp->server;
@@ -1129,7 +1180,8 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
 			val |= KVM_XICS_PENDING;
 		ret = 0;
 	}
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 
 	if (!ret && put_user(val, ubufp))
 		ret = -EFAULT;
@@ -1146,6 +1198,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	u64 val;
 	u8 prio;
 	u32 server;
+	unsigned long flags;
 
 	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
 		return -ENOENT;
@@ -1166,7 +1219,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	    kvmppc_xics_find_server(xics->kvm, server) == NULL)
 		return -EINVAL;
 
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 	irqp->server = server;
 	irqp->saved_priority = prio;
 	if (val & KVM_XICS_MASKED)
@@ -1178,7 +1232,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
 		irqp->asserted = 1;
 	irqp->exists = 1;
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 
 	if (val & KVM_XICS_PENDING)
 		icp_deliver_irq(xics, NULL, irqp->number);
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 73f0f2723c07..56ea44f9867f 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -78,13 +78,22 @@ struct kvmppc_icp {
 	u32  rm_reject;
 	u32  rm_eoied_irq;
 
+	/* Counters for each reason we exited real mode */
+	unsigned long n_rm_kick_vcpu;
+	unsigned long n_rm_check_resend;
+	unsigned long n_rm_reject;
+	unsigned long n_rm_notify_eoi;
+	/* Counters for handling ICP processing in real mode */
+	unsigned long n_check_resend;
+	unsigned long n_reject;
+
 	/* Debug stuff for real mode */
 	union kvmppc_icp_state rm_dbgstate;
 	struct kvm_vcpu *rm_dbgtgt;
 };
 
 struct kvmppc_ics {
-	struct mutex lock;
+	arch_spinlock_t lock;
 	u16 icsid;
 	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
 };
@@ -96,6 +105,8 @@ struct kvmppc_xics {
 	u32 max_icsid;
 	bool real_mode;
 	bool real_mode_dbg;
+	u32 err_noics;
+	u32 err_noicp;
 	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
 };
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 91bbc845ac66..ac3ddf115f3d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -529,6 +529,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_RMA:
 		r = 0;
 		break;
+	case KVM_CAP_PPC_HWRNG:
+		r = kvmppc_hwrng_present();
+		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE