16 files changed, 1024 insertions, 305 deletions
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index c2024ac9d4e8..029be26b5a17 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -22,6 +22,9 @@ config KVM
 	select ANON_INODES
 	select HAVE_KVM_EVENTFD
 	select SRCU
+	select KVM_VFIO
+	select IRQ_BYPASS_MANAGER
+	select HAVE_KVM_IRQ_BYPASS
 
 config KVM_BOOK3S_HANDLER
 	bool
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 1f9e5529e692..7dd89b79d038 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -7,16 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm
 
-common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-		$(KVM)/eventfd.o
+common-objs-y = $(KVM)/kvm_main.o $(KVM)/eventfd.o
 common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
+common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
 
 CFLAGS_e500_mmu.o := -I.
 CFLAGS_e500_mmu_host.o := -I.
 CFLAGS_emulate.o  := -I.
 CFLAGS_emulate_loadstore.o  := -I.
 
-common-objs-y += powerpc.o emulate.o emulate_loadstore.o
+common-objs-y += powerpc.o emulate_loadstore.o
 obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
 obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
 
@@ -24,6 +24,7 @@ AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
 
 kvm-e500-objs := \
 	$(common-objs-y) \
+	emulate.o \
 	booke.o \
 	booke_emulate.o \
 	booke_interrupts.o \
@@ -35,6 +36,7 @@ kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs)
 
 kvm-e500mc-objs := \
 	$(common-objs-y) \
+	emulate.o \
 	booke.o \
 	booke_emulate.o \
 	bookehv_interrupts.o \
@@ -61,9 +63,6 @@ kvm-pr-y := \
 	book3s_32_mmu.o
 
 ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-kvm-book3s_64-module-objs := \
-	$(KVM)/coalesced_mmio.o
-
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
 	book3s_rmhandlers.o
 endif
@@ -78,6 +77,7 @@ kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
 
 ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
+	book3s_hv_hmi.o \
 	book3s_hv_rmhandlers.o \
 	book3s_hv_rm_mmu.o \
 	book3s_hv_ras.o \
@@ -88,11 +88,8 @@ endif
 kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
 	book3s_xics.o
 
-kvm-book3s_64-module-objs += \
-	$(KVM)/kvm_main.o \
-	$(KVM)/eventfd.o \
-	powerpc.o \
-	emulate_loadstore.o \
+kvm-book3s_64-module-objs := \
+	$(common-objs-y) \
 	book3s.o \
 	book3s_64_vio.o \
 	book3s_rtas.o \
@@ -102,6 +99,7 @@ kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs)
 
 kvm-book3s_32-objs := \
 	$(common-objs-y) \
+	emulate.o \
 	fpu.o \
 	book3s_paired_singles.o \
 	book3s.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 47018fcbf7d6..b6952dd23152 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "dec",         VCPU_STAT(dec_exits) },
 	{ "ext_intr",    VCPU_STAT(ext_intr_exits) },
 	{ "queue_intr",  VCPU_STAT(queue_intr) },
+	{ "halt_poll_success_ns",	VCPU_STAT(halt_poll_success_ns) },
+	{ "halt_poll_fail_ns",		VCPU_STAT(halt_poll_fail_ns) },
+	{ "halt_wait_ns",		VCPU_STAT(halt_wait_ns) },
 	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
 	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
+	{ "halt_successful_wait",	VCPU_STAT(halt_successful_wait) },
 	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "pf_storage",  VCPU_STAT(pf_storage) },
@@ -64,6 +68,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "ld_slow",     VCPU_STAT(ld_slow) },
 	{ "st",          VCPU_STAT(st) },
 	{ "st_slow",     VCPU_STAT(st_slow) },
+	{ "pthru_all",       VCPU_STAT(pthru_all) },
+	{ "pthru_host",      VCPU_STAT(pthru_host) },
+	{ "pthru_bad_aff",   VCPU_STAT(pthru_bad_aff) },
 	{ NULL }
 };
 
@@ -592,9 +599,6 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
 		case KVM_REG_PPC_BESCR:
 			*val = get_reg_val(id, vcpu->arch.bescr);
 			break;
-		case KVM_REG_PPC_VTB:
-			*val = get_reg_val(id, vcpu->arch.vtb);
-			break;
 		case KVM_REG_PPC_IC:
 			*val = get_reg_val(id, vcpu->arch.ic);
 			break;
@@ -666,9 +670,6 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
 		case KVM_REG_PPC_BESCR:
 			vcpu->arch.bescr = set_reg_val(id, *val);
 			break;
-		case KVM_REG_PPC_VTB:
-			vcpu->arch.vtb = set_reg_val(id, *val);
-			break;
 		case KVM_REG_PPC_IC:
 			vcpu->arch.ic = set_reg_val(id, *val);
 			break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 2afdb9c0937d..8359752b3efc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -498,6 +498,7 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
 	case SPRN_MMCR0:
 	case SPRN_MMCR1:
 	case SPRN_MMCR2:
+	case SPRN_UMMCR2:
 #endif
 		break;
 unprivileged:
@@ -579,7 +580,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
 		*spr_val = vcpu->arch.spurr;
 		break;
 	case SPRN_VTB:
-		*spr_val = vcpu->arch.vtb;
+		*spr_val = to_book3s(vcpu)->vtb;
 		break;
 	case SPRN_IC:
 		*spr_val = vcpu->arch.ic;
@@ -640,6 +641,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val
 	case SPRN_MMCR0:
 	case SPRN_MMCR1:
 	case SPRN_MMCR2:
+	case SPRN_UMMCR2:
 	case SPRN_TIR:
 #endif
 		*spr_val = 0;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2fd5580c8f6e..3686471be32b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -53,11 +53,15 @@
 #include <asm/smp.h>
 #include <asm/dbell.h>
 #include <asm/hmi.h>
+#include <asm/pnv-pci.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
 #include <linux/module.h>
+#include <linux/compiler.h>
 
 #include "book3s.h"
 
@@ -70,6 +74,8 @@
 
 /* Used to indicate that a guest page fault needs to be handled */
 #define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
+/* Used to indicate that a guest passthrough interrupt needs to be handled */
+#define RESUME_PASSTHROUGH	(RESUME_GUEST | RESUME_FLAG_ARCH2)
 
 /* Used as a "null" value for timebase values */
 #define TB_NIL	(~(u64)0)
@@ -89,14 +95,55 @@ static struct kernel_param_ops module_param_ops = {
 	.get = param_get_int,
 };
 
+module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
+							S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
+
 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
 							S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
+/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
+static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
+module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
+
+/* Factor by which the vcore halt poll interval is grown, default is to double
+ */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
+
+/* Factor by which the vcore halt poll interval is shrunk, default is to reset
+ */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
+		int *ip)
+{
+	int i = *ip;
+	struct kvm_vcpu *vcpu;
+
+	while (++i < MAX_SMT_THREADS) {
+		vcpu = READ_ONCE(vc->runnable_threads[i]);
+		if (vcpu) {
+			*ip = i;
+			return vcpu;
+		}
+	}
+	return NULL;
+}
+
+/* Used to traverse the list of runnable threads for a given vcore */
+#define for_each_runnable_thread(i, vcpu, vc) \
+	for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
+
 static bool kvmppc_ipi_thread(int cpu)
 {
 	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
@@ -991,6 +1038,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 		r = RESUME_GUEST;
 		break;
+	case BOOK3S_INTERRUPT_HV_RM_HARD:
+		r = RESUME_PASSTHROUGH;
+		break;
 	default:
 		kvmppc_dump_regs(vcpu);
 		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
@@ -1149,6 +1199,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_DPDES:
 		*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
 		break;
+	case KVM_REG_PPC_VTB:
+		*val = get_reg_val(id, vcpu->arch.vcore->vtb);
+		break;
 	case KVM_REG_PPC_DAWR:
 		*val = get_reg_val(id, vcpu->arch.dawr);
 		break;
@@ -1341,6 +1394,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_DPDES:
 		vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_VTB:
+		vcpu->arch.vcore->vtb = set_reg_val(id, *val);
+		break;
 	case KVM_REG_PPC_DAWR:
 		vcpu->arch.dawr = set_reg_val(id, *val);
 		break;
@@ -1493,7 +1549,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	if (vcore == NULL)
 		return NULL;
 
-	INIT_LIST_HEAD(&vcore->runnable_threads);
 	spin_lock_init(&vcore->lock);
 	spin_lock_init(&vcore->stoltb_lock);
 	init_swait_queue_head(&vcore->wq);
@@ -1802,7 +1857,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
 	spin_unlock_irq(&vcpu->arch.tbacct_lock);
 	--vc->n_runnable;
-	list_del(&vcpu->arch.run_list);
+	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
 }
 
 static int kvmppc_grab_hwthread(int cpu)
@@ -2048,66 +2103,6 @@ static void init_master_vcore(struct kvmppc_vcore *vc)
 	vc->conferring_threads = 0;
 }
 
-/*
- * See if the existing subcores can be split into 3 (or fewer) subcores
- * of at most two threads each, so we can fit in another vcore.  This
- * assumes there are at most two subcores and at most 6 threads in total.
- */
-static bool can_split_piggybacked_subcores(struct core_info *cip)
-{
-	int sub, new_sub;
-	int large_sub = -1;
-	int thr;
-	int n_subcores = cip->n_subcores;
-	struct kvmppc_vcore *vc, *vcnext;
-	struct kvmppc_vcore *master_vc = NULL;
-
-	for (sub = 0; sub < cip->n_subcores; ++sub) {
-		if (cip->subcore_threads[sub] <= 2)
-			continue;
-		if (large_sub >= 0)
-			return false;
-		large_sub = sub;
-		vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
-				      preempt_list);
-		if (vc->num_threads > 2)
-			return false;
-		n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
-	}
-	if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2))
-		return false;
-
-	/*
-	 * Seems feasible, so go through and move vcores to new subcores.
-	 * Note that when we have two or more vcores in one subcore,
-	 * all those vcores must have only one thread each.
-	 */
-	new_sub = cip->n_subcores;
-	thr = 0;
-	sub = large_sub;
-	list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
-		if (thr >= 2) {
-			list_del(&vc->preempt_list);
-			list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
-			/* vc->num_threads must be 1 */
-			if (++cip->subcore_threads[new_sub] == 1) {
-				cip->subcore_vm[new_sub] = vc->kvm;
-				init_master_vcore(vc);
-				master_vc = vc;
-				++cip->n_subcores;
-			} else {
-				vc->master_vcore = master_vc;
-				++new_sub;
-			}
-		}
-		thr += vc->num_threads;
-	}
-	cip->subcore_threads[large_sub] = 2;
-	cip->max_subcore_threads = 2;
-
-	return true;
-}
-
 static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 {
 	int n_threads = vc->num_threads;
@@ -2118,23 +2113,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 
 	if (n_threads < cip->max_subcore_threads)
 		n_threads = cip->max_subcore_threads;
-	if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
-		cip->max_subcore_threads = n_threads;
-	} else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
-		   vc->num_threads <= 2) {
-		/*
-		 * We may be able to fit another subcore in by
-		 * splitting an existing subcore with 3 or 4
-		 * threads into two 2-thread subcores, or one
-		 * with 5 or 6 threads into three subcores.
-		 * We can only do this if those subcores have
-		 * piggybacked virtual cores.
-		 */
-		if (!can_split_piggybacked_subcores(cip))
-			return false;
-	} else {
+	if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
 		return false;
-	}
+	cip->max_subcore_threads = n_threads;
 
 	sub = cip->n_subcores;
 	++cip->n_subcores;
@@ -2148,43 +2129,6 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 	return true;
 }
 
-static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
-				  struct core_info *cip, int sub)
-{
-	struct kvmppc_vcore *vc;
-	int n_thr;
-
-	vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
-			      preempt_list);
-
-	/* require same VM and same per-core reg values */
-	if (pvc->kvm != vc->kvm ||
-	    pvc->tb_offset != vc->tb_offset ||
-	    pvc->pcr != vc->pcr ||
-	    pvc->lpcr != vc->lpcr)
-		return false;
-
-	/* P8 guest with > 1 thread per core would see wrong TIR value */
-	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
-	    (vc->num_threads > 1 || pvc->num_threads > 1))
-		return false;
-
-	n_thr = cip->subcore_threads[sub] + pvc->num_threads;
-	if (n_thr > cip->max_subcore_threads) {
-		if (!subcore_config_ok(cip->n_subcores, n_thr))
-			return false;
-		cip->max_subcore_threads = n_thr;
-	}
-
-	cip->total_threads += pvc->num_threads;
-	cip->subcore_threads[sub] = n_thr;
-	pvc->master_vcore = vc;
-	list_del(&pvc->preempt_list);
-	list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
-
-	return true;
-}
-
 /*
  * Work out whether it is possible to piggyback the execution of
  * vcore *pvc onto the execution of the other vcores described in *cip.
@@ -2192,27 +2136,18 @@ static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
 static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
 			  int target_threads)
 {
-	int sub;
-
 	if (cip->total_threads + pvc->num_threads > target_threads)
 		return false;
-	for (sub = 0; sub < cip->n_subcores; ++sub)
-		if (cip->subcore_threads[sub] &&
-		    can_piggyback_subcore(pvc, cip, sub))
-			return true;
-
-	if (can_dynamic_split(pvc, cip))
-		return true;
 
-	return false;
+	return can_dynamic_split(pvc, cip);
 }
 
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vnext;
+	int i;
+	struct kvm_vcpu *vcpu;
 
-	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-				 arch.run_list) {
+	for_each_runnable_thread(i, vcpu, vc) {
 		if (signal_pending(vcpu->arch.run_task))
 			vcpu->arch.ret = -EINTR;
 		else if (vcpu->arch.vpa.update_pending ||
@@ -2259,15 +2194,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
 
 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
-	int still_running = 0;
+	int still_running = 0, i;
 	u64 now;
 	long ret;
-	struct kvm_vcpu *vcpu, *vnext;
+	struct kvm_vcpu *vcpu;
 
 	spin_lock(&vc->lock);
 	now = get_tb();
-	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-				 arch.run_list) {
+	for_each_runnable_thread(i, vcpu, vc) {
 		/* cancel pending dec exception if dec is positive */
 		if (now < vcpu->arch.dec_expires &&
 		    kvmppc_core_pending_dec(vcpu))
@@ -2307,8 +2241,8 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 		}
 		if (vc->n_runnable > 0 && vc->runner == NULL) {
 			/* make sure there's a candidate runner awake */
-			vcpu = list_first_entry(&vc->runnable_threads,
-						struct kvm_vcpu, arch.run_list);
+			i = -1;
+			vcpu = next_runnable_thread(vc, &i);
 			wake_up(&vcpu->arch.cpu_run);
 		}
 	}
@@ -2361,7 +2295,7 @@ static inline void kvmppc_set_host_core(int cpu)
  */
 static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vnext;
+	struct kvm_vcpu *vcpu;
 	int i;
 	int srcu_idx;
 	struct core_info core_info;
@@ -2397,8 +2331,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 */
 	if ((threads_per_core > 1) &&
 	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-		list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-					 arch.run_list) {
+		for_each_runnable_thread(i, vcpu, vc) {
 			vcpu->arch.ret = -EBUSY;
 			kvmppc_remove_runnable(vc, vcpu);
 			wake_up(&vcpu->arch.cpu_run);
@@ -2477,8 +2410,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		active |= 1 << thr;
 		list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
 			pvc->pcpu = pcpu + thr;
-			list_for_each_entry(vcpu, &pvc->runnable_threads,
-					    arch.run_list) {
+			for_each_runnable_thread(i, vcpu, pvc) {
 				kvmppc_start_thread(vcpu, pvc);
 				kvmppc_create_dtl_entry(vcpu, pvc);
 				trace_kvm_guest_enter(vcpu);
@@ -2604,34 +2536,92 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
 	finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 
+static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+	/* 10us base */
+	if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
+		vc->halt_poll_ns = 10000;
+	else
+		vc->halt_poll_ns *= halt_poll_ns_grow;
+
+	if (vc->halt_poll_ns > halt_poll_max_ns)
+		vc->halt_poll_ns = halt_poll_max_ns;
+}
+
+static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+	if (halt_poll_ns_shrink == 0)
+		vc->halt_poll_ns = 0;
+	else
+		vc->halt_poll_ns /= halt_poll_ns_shrink;
+}
+
+/* Check to see if any of the runnable vcpus on the vcore have pending
+ * exceptions or are no longer ceded
+ */
+static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	for_each_runnable_thread(i, vcpu, vc) {
+		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * All the vcpus in this vcore are idle, so wait for a decrementer
  * or external interrupt to one of the vcpus.  vc->lock is held.
  */
 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu;
+	ktime_t cur, start_poll, start_wait;
 	int do_sleep = 1;
+	u64 block_ns;
 	DECLARE_SWAITQUEUE(wait);
 
-	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+	/* Poll for pending exceptions and ceded state */
+	cur = start_poll = ktime_get();
+	if (vc->halt_poll_ns) {
+		ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
+		++vc->runner->stat.halt_attempted_poll;
 
-	/*
-	 * Check one last time for pending exceptions and ceded state after
-	 * we put ourselves on the wait queue
-	 */
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
-			do_sleep = 0;
-			break;
+		vc->vcore_state = VCORE_POLLING;
+		spin_unlock(&vc->lock);
+
+		do {
+			if (kvmppc_vcore_check_block(vc)) {
+				do_sleep = 0;
+				break;
+			}
+			cur = ktime_get();
+		} while (single_task_running() && ktime_before(cur, stop));
+
+		spin_lock(&vc->lock);
+		vc->vcore_state = VCORE_INACTIVE;
+
+		if (!do_sleep) {
+			++vc->runner->stat.halt_successful_poll;
+			goto out;
 		}
 	}
 
-	if (!do_sleep) {
+	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
+
+	if (kvmppc_vcore_check_block(vc)) {
 		finish_swait(&vc->wq, &wait);
-		return;
+		do_sleep = 0;
+		/* If we polled, count this as a successful poll */
+		if (vc->halt_poll_ns)
+			++vc->runner->stat.halt_successful_poll;
+		goto out;
 	}
 
+	start_wait = ktime_get();
+
 	vc->vcore_state = VCORE_SLEEPING;
 	trace_kvmppc_vcore_blocked(vc, 0);
 	spin_unlock(&vc->lock);
@@ -2640,13 +2630,52 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 	spin_lock(&vc->lock);
 	vc->vcore_state = VCORE_INACTIVE;
 	trace_kvmppc_vcore_blocked(vc, 1);
+	++vc->runner->stat.halt_successful_wait;
+
+	cur = ktime_get();
+
+out:
+	block_ns = ktime_to_ns(cur) - ktime_to_ns(start_poll);
+
+	/* Attribute wait time */
+	if (do_sleep) {
+		vc->runner->stat.halt_wait_ns +=
+			ktime_to_ns(cur) - ktime_to_ns(start_wait);
+		/* Attribute failed poll time */
+		if (vc->halt_poll_ns)
+			vc->runner->stat.halt_poll_fail_ns +=
+				ktime_to_ns(start_wait) -
+				ktime_to_ns(start_poll);
+	} else {
+		/* Attribute successful poll time */
+		if (vc->halt_poll_ns)
+			vc->runner->stat.halt_poll_success_ns +=
+				ktime_to_ns(cur) -
+				ktime_to_ns(start_poll);
+	}
+
+	/* Adjust poll time */
+	if (halt_poll_max_ns) {
+		if (block_ns <= vc->halt_poll_ns)
+			;
+		/* We slept and blocked for longer than the max halt time */
+		else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
+			shrink_halt_poll_ns(vc);
+		/* We slept and our poll time is too small */
+		else if (vc->halt_poll_ns < halt_poll_max_ns &&
+				block_ns < halt_poll_max_ns)
+			grow_halt_poll_ns(vc);
+	} else
+		vc->halt_poll_ns = 0;
+
+	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
 }
 
 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-	int n_ceded;
+	int n_ceded, i;
 	struct kvmppc_vcore *vc;
-	struct kvm_vcpu *v, *vn;
+	struct kvm_vcpu *v;
 
 	trace_kvmppc_run_vcpu_enter(vcpu);
 
@@ -2666,7 +2695,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
 	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
 	vcpu->arch.busy_preempt = TB_NIL;
-	list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
+	WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], vcpu);
 	++vc->n_runnable;
 
 	/*
@@ -2706,8 +2735,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
 			continue;
 		}
-		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
-					 arch.run_list) {
+		for_each_runnable_thread(i, v, vc) {
 			kvmppc_core_prepare_to_enter(v);
 			if (signal_pending(v->arch.run_task)) {
 				kvmppc_remove_runnable(vc, v);
@@ -2720,7 +2748,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
 			break;
 		n_ceded = 0;
-		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
+		for_each_runnable_thread(i, v, vc) {
 			if (!v->arch.pending_exceptions)
 				n_ceded += v->arch.ceded;
 			else
@@ -2759,8 +2787,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
 		/* Wake up some vcpu to run the core */
-		v = list_first_entry(&vc->runnable_threads,
-				     struct kvm_vcpu, arch.run_list);
+		i = -1;
+		v = next_runnable_thread(vc, &i);
 		wake_up(&v->arch.cpu_run);
 	}
 
@@ -2818,7 +2846,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			r = kvmppc_book3s_hv_page_fault(run, vcpu,
 				vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
 			srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
-		}
+		} else if (r == RESUME_PASSTHROUGH)
+			r = kvmppc_xics_rm_complete(vcpu, 0);
 	} while (is_kvmppc_resume_guest(r));
 
  out:
@@ -3247,6 +3276,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 	kvmppc_free_vcores(kvm);
 
 	kvmppc_free_hpt(kvm);
+
+	kvmppc_free_pimap(kvm);
 }
 
 /* We don't need to emulate any privileged instructions or dcbz */
@@ -3282,6 +3313,184 @@ static int kvmppc_core_check_processor_compat_hv(void)
 	return 0;
 }
 
+#ifdef CONFIG_KVM_XICS
+
+void kvmppc_free_pimap(struct kvm *kvm)
+{
+	kfree(kvm->arch.pimap);
+}
+
+static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+{
+	return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
+}
+
+static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+	struct irq_desc *desc;
+	struct kvmppc_irq_map *irq_map;
+	struct kvmppc_passthru_irqmap *pimap;
+	struct irq_chip *chip;
+	int i;
+
+	if (!kvm_irq_bypass)
+		return 1;
+
+	desc = irq_to_desc(host_irq);
+	if (!desc)
+		return -EIO;
+
+	mutex_lock(&kvm->lock);
+
+	pimap = kvm->arch.pimap;
+	if (pimap == NULL) {
+		/* First call, allocate structure to hold IRQ map */
+		pimap = kvmppc_alloc_pimap();
+		if (pimap == NULL) {
+			mutex_unlock(&kvm->lock);
+			return -ENOMEM;
+		}
+		kvm->arch.pimap = pimap;
+	}
+
+	/*
+	 * For now, we only support interrupts for which the EOI operation
+	 * is an OPAL call followed by a write to XIRR, since that's
+	 * what our real-mode EOI code does.
+	 */
+	chip = irq_data_get_irq_chip(&desc->irq_data);
+	if (!chip || !is_pnv_opal_msi(chip)) {
+		pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
+			host_irq, guest_gsi);
+		mutex_unlock(&kvm->lock);
+		return -ENOENT;
+	}
+
+	/*
+	 * See if we already have an entry for this guest IRQ number.
+	 * If it's mapped to a hardware IRQ number, that's an error,
+	 * otherwise re-use this entry.
+	 */
+	for (i = 0; i < pimap->n_mapped; i++) {
+		if (guest_gsi == pimap->mapped[i].v_hwirq) {
+			if (pimap->mapped[i].r_hwirq) {
+				mutex_unlock(&kvm->lock);
+				return -EINVAL;
+			}
+			break;
+		}
+	}
+
+	if (i == KVMPPC_PIRQ_MAPPED) {
+		mutex_unlock(&kvm->lock);
+		return -EAGAIN;		/* table is full */
+	}
+
+	irq_map = &pimap->mapped[i];
+
+	irq_map->v_hwirq = guest_gsi;
+	irq_map->desc = desc;
+
+	/*
+	 * Order the above two stores before the next to serialize with
+	 * the KVM real mode handler.
+	 */
+	smp_wmb();
+	irq_map->r_hwirq = desc->irq_data.hwirq;
+
+	if (i == pimap->n_mapped)
+		pimap->n_mapped++;
+
+	kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+
+	mutex_unlock(&kvm->lock);
+
+	return 0;
+}
+
+static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
+{
+	struct irq_desc *desc;
+	struct kvmppc_passthru_irqmap *pimap;
+	int i;
+
+	if (!kvm_irq_bypass)
+		return 0;
+
+	desc = irq_to_desc(host_irq);
+	if (!desc)
+		return -EIO;
+
+	mutex_lock(&kvm->lock);
+
+	if (kvm->arch.pimap == NULL) {
+		mutex_unlock(&kvm->lock);
+		return 0;
+	}
+	pimap = kvm->arch.pimap;
+
+	for (i = 0; i < pimap->n_mapped; i++) {
+		if (guest_gsi == pimap->mapped[i].v_hwirq)
+			break;
+	}
+
+	if (i == pimap->n_mapped) {
+		mutex_unlock(&kvm->lock);
+		return -ENODEV;
+	}
+
+	kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+
+	/* invalidate the entry */
+	pimap->mapped[i].r_hwirq = 0;
+
+	/*
+	 * We don't free this structure even when the count goes to
+	 * zero. The structure is freed when we destroy the VM.
+	 */
+
+	mutex_unlock(&kvm->lock);
+	return 0;
+}
+
+static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
+					     struct irq_bypass_producer *prod)
+{
+	int ret = 0;
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	irqfd->producer = prod;
+
+	ret = kvmppc_set_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+	if (ret)
+		pr_info("kvmppc_set_passthru_irq (irq %d, gsi %d) fails: %d\n",
+			prod->irq, irqfd->gsi, ret);
+
+	return ret;
+}
+
+static void kvmppc_irq_bypass_del_producer_hv(struct irq_bypass_consumer *cons,
+					      struct irq_bypass_producer *prod)
+{
+	int ret;
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	irqfd->producer = NULL;
+
+	/*
+	 * When producer of consumer is unregistered, we change back to
+	 * default external interrupt handling mode - KVM real mode
+	 * will switch back to host.
+	 */
+	ret = kvmppc_clr_passthru_irq(irqfd->kvm, prod->irq, irqfd->gsi);
+	if (ret)
+		pr_warn("kvmppc_clr_passthru_irq (irq %d, gsi %d) fails: %d\n",
+			prod->irq, irqfd->gsi, ret);
+}
+#endif
+
 static long kvm_arch_vm_ioctl_hv(struct file *filp,
 				 unsigned int ioctl, unsigned long arg)
 {
@@ -3400,6 +3609,10 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
 	.arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
 	.hcall_implemented = kvmppc_hcall_impl_hv,
+#ifdef CONFIG_KVM_XICS
+	.irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv,
+	.irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv,
+#endif
 };
 
 static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5f0380db3eab..0c84d6bc8356 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,7 @@
 #include <asm/xics.h>
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
+#include <asm/io.h>
 
 #define KVM_CMA_CHUNK_ORDER	18
 
@@ -286,3 +287,158 @@ void kvmhv_commence_exit(int trap)
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
 EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
+
+#ifdef CONFIG_KVM_XICS
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+					 u32 xisr)
+{
+	int i;
+
+	/*
+	 * We access the mapped array here without a lock.  That
+	 * is safe because we never reduce the number of entries
+	 * in the array and we never change the v_hwirq field of
+	 * an entry once it is set.
+	 *
+	 * We have also carefully ordered the stores in the writer
+	 * and the loads here in the reader, so that if we find a matching
+	 * hwirq here, the associated GSI and irq_desc fields are valid.
+	 */
+	for (i = 0; i < pimap->n_mapped; i++)  {
+		if (xisr == pimap->mapped[i].r_hwirq) {
+			/*
+			 * Order subsequent reads in the caller to serialize
+			 * with the writer.
+			 */
+			smp_rmb();
+			return &pimap->mapped[i];
+		}
+	}
+	return NULL;
+}
+
+/*
+ * If we have an interrupt that's not an IPI, check if we have a
+ * passthrough adapter and if so, check if this external interrupt
+ * is for the adapter.
+ * We will attempt to deliver the IRQ directly to the target VCPU's
+ * ICP, the virtual ICP (based on affinity - the xive value in ICS).
+ *
+ * If the delivery fails or if this is not for a passthrough adapter,
+ * return to the host to handle this interrupt. We earlier
+ * saved a copy of the XIRR in the PACA, it will be picked up by
+ * the host ICP driver.
+ */
+static int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+	struct kvmppc_passthru_irqmap *pimap;
+	struct kvmppc_irq_map *irq_map;
+	struct kvm_vcpu *vcpu;
+
+	vcpu = local_paca->kvm_hstate.kvm_vcpu;
+	if (!vcpu)
+		return 1;
+	pimap = kvmppc_get_passthru_irqmap(vcpu->kvm);
+	if (!pimap)
+		return 1;
+	irq_map = get_irqmap(pimap, xisr);
+	if (!irq_map)
+		return 1;
+
+	/* We're handling this interrupt, generic code doesn't need to */
+	local_paca->kvm_hstate.saved_xirr = 0;
+
+	return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap);
+}
+
+#else
+static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr)
+{
+	return 1;
+}
+#endif
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ *	0 if no interrupt is pending
+ *	1 if an interrupt is pending that needs to be handled by the host
+ *	2 Passthrough that needs completion in the host
+ *	-1 if there was a guest wakeup IPI (which has now been cleared)
+ *	-2 if there is PCI passthrough external interrupt that was handled
+ */
+
+long kvmppc_read_intr(void)
+{
+	unsigned long xics_phys;
+	u32 h_xirr;
+	__be32 xirr;
+	u32 xisr;
+	u8 host_ipi;
+
+	/* see if a host IPI is pending */
+	host_ipi = local_paca->kvm_hstate.host_ipi;
+	if (host_ipi)
+		return 1;
+
+	/* Now read the interrupt from the ICP */
+	xics_phys = local_paca->kvm_hstate.xics_phys;
+	if (unlikely(!xics_phys))
+		return 1;
+
+	/*
+	 * Save XIRR for later. Since we get control in reverse endian
+	 * on LE systems, save it byte reversed and fetch it back in
+	 * host endian. Note that xirr is the value read from the
+	 * XIRR register, while h_xirr is the host endian version.
+	 */
+	xirr = _lwzcix(xics_phys + XICS_XIRR);
+	h_xirr = be32_to_cpu(xirr);
+	local_paca->kvm_hstate.saved_xirr = h_xirr;
+	xisr = h_xirr & 0xffffff;
+	/*
+	 * Ensure that the store/load complete to guarantee all side
+	 * effects of loading from XIRR has completed
+	 */
+	smp_mb();
+
+	/* if nothing pending in the ICP */
+	if (!xisr)
+		return 0;
+
+	/* We found something in the ICP...
+	 *
+	 * If it is an IPI, clear the MFRR and EOI it.
+	 */
+	if (xisr == XICS_IPI) {
+		_stbcix(xics_phys + XICS_MFRR, 0xff);
+		_stwcix(xics_phys + XICS_XIRR, xirr);
+		/*
+		 * Need to ensure side effects of above stores
+		 * complete before proceeding.
+		 */
+		smp_mb();
+
+		/*
+		 * We need to re-check host IPI now in case it got set in the
+		 * meantime. If it's clear, we bounce the interrupt to the
+		 * guest
+		 */
+		host_ipi = local_paca->kvm_hstate.host_ipi;
+		if (unlikely(host_ipi != 0)) {
+			/* We raced with the host,
+			 * we need to resend that IPI, bummer
+			 */
+			_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+			/* Let side effects complete */
+			smp_mb();
+			return 1;
+		}
+
+		/* OK, it's an IPI for us */
+		local_paca->kvm_hstate.saved_xirr = 0;
+		return -1;
+	}
+
+	return kvmppc_check_passthru(xisr, xirr);
+}
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
new file mode 100644
index 000000000000..e3f738eb1cac
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -0,0 +1,56 @@
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+
+void wait_for_subcore_guest_exit(void)
+{
+	int i;
+
+	/*
+	 * NULL bitmap pointer indicates that KVM module hasn't
+	 * been loaded yet and hence no guests are running.
+	 * If no KVM is in use, no need to co-ordinate among threads
+	 * as all of them will always be in host and no one is going
+	 * to modify TB other than the opal hmi handler.
+	 * Hence, just return from here.
+	 */
+	if (!local_paca->sibling_subcore_state)
+		return;
+
+	for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+		while (local_paca->sibling_subcore_state->in_guest[i])
+			cpu_relax();
+}
+
+void wait_for_tb_resync(void)
+{
+	if (!local_paca->sibling_subcore_state)
+		return;
+
+	while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+				&local_paca->sibling_subcore_state->flags))
+		cpu_relax();
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 980d8a6f7284..82ff5de8b1e7 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/kvm_host.h>
 #include <linux/err.h>
+#include <linux/kernel_stat.h>
 
 #include <asm/kvm_book3s.h>
 #include <asm/kvm_ppc.h>
@@ -18,7 +19,10 @@
 #include <asm/debug.h>
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
+#include <asm/pgtable.h>
 #include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
+#include <asm/opal.h>
 
 #include "book3s_xics.h"
 
@@ -26,9 +30,12 @@
 
 int h_ipi_redirect = 1;
 EXPORT_SYMBOL(h_ipi_redirect);
+int kvm_irq_bypass = 1;
+EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			    u32 new_irq);
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu);
 
 /* -- ICS routines -- */
 static void ics_rm_check_resend(struct kvmppc_xics *xics,
@@ -708,10 +715,123 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 		icp->rm_action |= XICS_RM_NOTIFY_EOI;
 		icp->rm_eoied_irq = irq;
 	}
+
+	if (state->host_irq) {
+		++vcpu->stat.pthru_all;
+		if (state->intr_cpu != -1) {
+			int pcpu = raw_smp_processor_id();
+
+			pcpu = cpu_first_thread_sibling(pcpu);
+			++vcpu->stat.pthru_host;
+			if (state->intr_cpu != pcpu) {
+				++vcpu->stat.pthru_bad_aff;
+				xics_opal_rm_set_server(state->host_irq, pcpu);
+			}
+			state->intr_cpu = -1;
+		}
+	}
  bail:
 	return check_too_hard(xics, icp);
 }
 
+unsigned long eoi_rc;
+
+static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
+{
+	unsigned long xics_phys;
+	int64_t rc;
+
+	rc = pnv_opal_pci_msi_eoi(c, hwirq);
+
+	if (rc)
+		eoi_rc = rc;
+
+	iosync();
+
+	/* EOI it */
+	xics_phys = local_paca->kvm_hstate.xics_phys;
+	_stwcix(xics_phys + XICS_XIRR, xirr);
+}
+
+static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
+{
+	unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;
+
+	return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
+}
+
+/*
+ * Increment a per-CPU 32-bit unsigned integer variable.
+ * Safe to call in real-mode. Handles vmalloc'ed addresses
+ *
+ * ToDo: Make this work for any integral type
+ */
+
+static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
+{
+	unsigned long l;
+	unsigned int *raddr;
+	int cpu = smp_processor_id();
+
+	raddr = per_cpu_ptr(addr, cpu);
+	l = (unsigned long)raddr;
+
+	if (REGION_ID(l) == VMALLOC_REGION_ID) {
+		l = vmalloc_to_phys(raddr);
+		raddr = (unsigned int *)l;
+	}
+	++*raddr;
+}
+
+/*
+ * We don't try to update the flags in the irq_desc 'istate' field in
+ * here as would happen in the normal IRQ handling path for several reasons:
+ *  - state flags represent internal IRQ state and are not expected to be
+ *    updated outside the IRQ subsystem
+ *  - more importantly, these are useful for edge triggered interrupts,
+ *    IRQ probing, etc., but we are only handling MSI/MSIx interrupts here
+ *    and these states shouldn't apply to us.
+ *
+ * However, we do update irq_stats - we somewhat duplicate the code in
+ * kstat_incr_irqs_this_cpu() for this since this function is defined
+ * in irq/internal.h which we don't want to include here.
+ * The only difference is that desc->kstat_irqs is an allocated per CPU
+ * variable and could have been vmalloc'ed, so we can't directly
+ * call __this_cpu_inc() on it. The kstat structure is a static
+ * per CPU variable and it should be accessible by real-mode KVM.
+ *
+ */
+static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
+{
+	this_cpu_inc_rm(desc->kstat_irqs);
+	__this_cpu_inc(kstat.irqs_sum);
+}
+
+long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
+				 u32 xirr,
+				 struct kvmppc_irq_map *irq_map,
+				 struct kvmppc_passthru_irqmap *pimap)
+{
+	struct kvmppc_xics *xics;
+	struct kvmppc_icp *icp;
+	u32 irq;
+
+	irq = irq_map->v_hwirq;
+	xics = vcpu->kvm->arch.xics;
+	icp = vcpu->arch.icp;
+
+	kvmppc_rm_handle_irq_desc(irq_map->desc);
+	icp_rm_deliver_irq(xics, icp, irq);
+
+	/* EOI the interrupt */
+	icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr);
+
+	if (check_too_hard(xics, icp) == H_TOO_HARD)
+		return 2;
+	else
+		return -2;
+}
+
 /*  --- Non-real mode XICS-related built-in routines ---  */
 
 /**
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 975655573844..c3c1d1bcfc67 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -221,6 +221,13 @@ kvmppc_primary_no_guest:
 	li	r3, 0		/* Don't wake on privileged (OS) doorbell */
 	b	kvm_do_nap
 
+/*
+ * kvm_novcpu_wakeup
+ *	Entered from kvm_start_guest if kvm_hstate.napping is set
+ *	to NAPPING_NOVCPU
+ *		r2 = kernel TOC
+ *		r13 = paca
+ */
 kvm_novcpu_wakeup:
 	ld	r1, HSTATE_HOST_R1(r13)
 	ld	r5, HSTATE_KVM_VCORE(r13)
@@ -230,6 +237,13 @@ kvm_novcpu_wakeup:
 	/* check the wake reason */
 	bl	kvmppc_check_wake_reason
 
+	/*
+	 * Restore volatile registers since we could have called
+	 * a C routine in kvmppc_check_wake_reason.
+	 *	r5 = VCORE
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+
 	/* see if any other thread is already exiting */
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
@@ -322,6 +336,11 @@ kvm_start_guest:
 
 	/* Check the wake reason in SRR1 to see why we got here */
 	bl	kvmppc_check_wake_reason
+	/*
+	 * kvmppc_check_wake_reason could invoke a C routine, but we
+	 * have no volatile registers to restore when we return.
+	 */
+
 	cmpdi	r3, 0
 	bge	kvm_no_guest
 
@@ -625,9 +644,11 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
 38:
 
 BEGIN_FTR_SECTION
-	/* DPDES is shared between threads */
+	/* DPDES and VTB are shared between threads */
 	ld	r8, VCORE_DPDES(r5)
+	ld	r7, VCORE_VTB(r5)
 	mtspr	SPRN_DPDES, r8
+	mtspr	SPRN_VTB, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 	/* Mark the subcore state as inside guest */
@@ -787,10 +808,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_CIABR, r7
 	mtspr	SPRN_TAR, r8
 	ld	r5, VCPU_IC(r4)
-	ld	r6, VCPU_VTB(r4)
-	mtspr	SPRN_IC, r5
-	mtspr	SPRN_VTB, r6
 	ld	r8, VCPU_EBBHR(r4)
+	mtspr	SPRN_IC, r5
 	mtspr	SPRN_EBBHR, r8
 	ld	r5, VCPU_EBBRR(r4)
 	ld	r6, VCPU_BESCR(r4)
@@ -881,6 +900,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	cmpwi	r3, 512		/* 1 microsecond */
 	blt	hdec_soon
 
+deliver_guest_interrupt:
 	ld	r6, VCPU_CTR(r4)
 	ld	r7, VCPU_XER(r4)
 
@@ -895,7 +915,6 @@ kvmppc_cede_reentry:		/* r4 = vcpu, r13 = paca */
 	mtspr	SPRN_SRR0, r6
 	mtspr	SPRN_SRR1, r7
 
-deliver_guest_interrupt:
 	/* r11 = vcpu->arch.msr & ~MSR_HV */
 	rldicl	r11, r11, 63 - MSR_HV_LG, 1
 	rotldi	r11, r11, 1 + MSR_HV_LG
@@ -1155,10 +1174,54 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 * set, we know the host wants us out so let's do it now
 	 */
 	bl	kvmppc_read_intr
+
+	/*
+	 * Restore the active volatile registers after returning from
+	 * a C function.
+	 */
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+
+	/*
+	 * kvmppc_read_intr return codes:
+	 *
+	 * Exit to host (r3 > 0)
+	 *   1 An interrupt is pending that needs to be handled by the host
+	 *     Exit guest and return to host by branching to guest_exit_cont
+	 *
+	 *   2 Passthrough that needs completion in the host
+	 *     Exit guest and return to host by branching to guest_exit_cont
+	 *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+	 *     to indicate to the host to complete handling the interrupt
+	 *
+	 * Before returning to guest, we check if any CPU is heading out
+	 * to the host and if so, we head out also. If no CPUs are heading
+	 * check return values <= 0.
+	 *
+	 * Return to guest (r3 <= 0)
+	 *  0 No external interrupt is pending
+	 * -1 A guest wakeup IPI (which has now been cleared)
+	 *    In either case, we return to guest to deliver any pending
+	 *    guest interrupts.
+	 *
+	 * -2 A PCI passthrough external interrupt was handled
+	 *    (interrupt was delivered directly to guest)
+	 *    Return to guest to deliver any pending guest interrupts.
+	 */
+
+	cmpdi	r3, 1
+	ble	1f
+
+	/* Return code = 2 */
+	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
+	stw	r12, VCPU_TRAP(r9)
+	b	guest_exit_cont
+
+1:	/* Return code <= 1 */
 	cmpdi	r3, 0
 	bgt	guest_exit_cont
 
-	/* Check if any CPU is heading out to the host, if so head out too */
+	/* Return code <= 0 */
 4:	ld	r5, HSTATE_KVM_VCORE(r13)
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
@@ -1271,10 +1334,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	stw	r6, VCPU_PSPB(r9)
 	std	r7, VCPU_FSCR(r9)
 	mfspr	r5, SPRN_IC
-	mfspr	r6, SPRN_VTB
 	mfspr	r7, SPRN_TAR
 	std	r5, VCPU_IC(r9)
-	std	r6, VCPU_VTB(r9)
 	std	r7, VCPU_TAR(r9)
 	mfspr	r8, SPRN_EBBHR
 	std	r8, VCPU_EBBHR(r9)
@@ -1501,9 +1562,11 @@ kvmhv_switch_to_host:
 	isync
 
 BEGIN_FTR_SECTION
-	/* DPDES is shared between threads */
+	/* DPDES and VTB are shared between threads */
 	mfspr	r7, SPRN_DPDES
+	mfspr	r8, SPRN_VTB
 	std	r7, VCORE_DPDES(r5)
+	std	r8, VCORE_VTB(r5)
 	/* clear DPDES so we don't get guest doorbells in the host */
 	li	r8, 0
 	mtspr	SPRN_DPDES, r8
@@ -2213,10 +2276,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 	ld	r29, VCPU_GPR(R29)(r4)
 	ld	r30, VCPU_GPR(R30)(r4)
 	ld	r31, VCPU_GPR(R31)(r4)
- 
+
 	/* Check the wake reason in SRR1 to see why we got here */
 	bl	kvmppc_check_wake_reason
 
+	/*
+	 * Restore volatile registers since we could have called a
+	 * C routine in kvmppc_check_wake_reason
+	 *	r4 = VCPU
+	 * r3 tells us whether we need to return to host or not
+	 * WARNING: it gets checked further down:
+	 * should not modify r3 until this check is done.
+	 */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+
 	/* clear our bit in vcore->napping_threads */
 34:	ld	r5,HSTATE_KVM_VCORE(r13)
 	lbz	r7,HSTATE_PTID(r13)
@@ -2230,7 +2303,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 	li	r0,0
 	stb	r0,HSTATE_NAPPING(r13)
 
-	/* See if the wake reason means we need to exit */
+	/* See if the wake reason saved in r3 means we need to exit */
 	stw	r12, VCPU_TRAP(r4)
 	mr	r9, r4
 	cmpdi	r3, 0
@@ -2297,10 +2370,14 @@ machine_check_realmode:
  *	0 if nothing needs to be done
  *	1 if something happened that needs to be handled by the host
  *	-1 if there was a guest wakeup (IPI or msgsnd)
+ *	-2 if we handled a PCI passthrough interrupt (returned by
+ *		kvmppc_read_intr only)
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
- * Modifies r0, r6, r7, r8.
+ * Modifies all volatile registers (since it may call a C function).
+ * This routine calls kvmppc_read_intr, a C function, if an external
+ * interrupt is pending.
  */
 kvmppc_check_wake_reason:
 	mfspr	r6, SPRN_SRR1
@@ -2310,8 +2387,7 @@ FTR_SECTION_ELSE
 	rlwinm	r6, r6, 45-31, 0xe	/* P7 wake reason field is 3 bits */
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
 	cmpwi	r6, 8			/* was it an external interrupt? */
-	li	r12, BOOK3S_INTERRUPT_EXTERNAL
-	beq	kvmppc_read_intr	/* if so, see what it was */
+	beq	7f			/* if so, see what it was */
 	li	r3, 0
 	li	r12, 0
 	cmpwi	r6, 6			/* was it the decrementer? */
@@ -2350,83 +2426,28 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	li	r3, 1
 	blr
 
-/*
- * Determine what sort of external interrupt is pending (if any).
- * Returns:
- *	0 if no interrupt is pending
- *	1 if an interrupt is pending that needs to be handled by the host
- *	-1 if there was a guest wakeup IPI (which has now been cleared)
- * Modifies r0, r6, r7, r8, returns value in r3.
- */
-kvmppc_read_intr:
-	/* see if a host IPI is pending */
-	li	r3, 1
-	lbz	r0, HSTATE_HOST_IPI(r13)
-	cmpwi	r0, 0
-	bne	1f
+	/* external interrupt - create a stack frame so we can call C */
+7:	mflr	r0
+	std	r0, PPC_LR_STKOFF(r1)
+	stdu	r1, -PPC_MIN_STKFRM(r1)
+	bl	kvmppc_read_intr
+	nop
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+	cmpdi	r3, 1
+	ble	1f
 
-	/* Now read the interrupt from the ICP */
-	ld	r6, HSTATE_XICS_PHYS(r13)
-	li	r7, XICS_XIRR
-	cmpdi	r6, 0
-	beq-	1f
-	lwzcix	r0, r6, r7
 	/*
-	 * Save XIRR for later. Since we get in in reverse endian on LE
-	 * systems, save it byte reversed and fetch it back in host endian.
-	 */
-	li	r3, HSTATE_SAVED_XIRR
-	STWX_BE	r0, r3, r13
-#ifdef __LITTLE_ENDIAN__
-	lwz	r3, HSTATE_SAVED_XIRR(r13)
-#else
-	mr	r3, r0
-#endif
-	rlwinm.	r3, r3, 0, 0xffffff
-	sync
-	beq	1f			/* if nothing pending in the ICP */
-
-	/* We found something in the ICP...
-	 *
-	 * If it's not an IPI, stash it in the PACA and return to
-	 * the host, we don't (yet) handle directing real external
-	 * interrupts directly to the guest
+	 * Return code of 2 means PCI passthrough interrupt, but
+	 * we need to return back to host to complete handling the
+	 * interrupt. Trap reason is expected in r12 by guest
+	 * exit code.
 	 */
-	cmpwi	r3, XICS_IPI		/* if there is, is it an IPI? */
-	bne	42f
-
-	/* It's an IPI, clear the MFRR and EOI it */
-	li	r3, 0xff
-	li	r8, XICS_MFRR
-	stbcix	r3, r6, r8		/* clear the IPI */
-	stwcix	r0, r6, r7		/* EOI it */
-	sync
-
-	/* We need to re-check host IPI now in case it got set in the
-	 * meantime. If it's clear, we bounce the interrupt to the
-	 * guest
-	 */
-	lbz	r0, HSTATE_HOST_IPI(r13)
-	cmpwi	r0, 0
-	bne-	43f
-
-	/* OK, it's an IPI for us */
-	li	r12, 0
-	li	r3, -1
-1:	blr
-
-42:	/* It's not an IPI and it's for the host. We saved a copy of XIRR in
-	 * the PACA earlier, it will be picked up by the host ICP driver
-	 */
-	li	r3, 1
-	b	1b
-
-43:	/* We raced with the host, we need to resend that IPI, bummer */
-	li	r0, IPI_PRIORITY
-	stbcix	r0, r6, r8		/* set the IPI */
-	sync
-	li	r3, 1
-	b	1b
+	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
+1:
+	ld	r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1)
+	addi	r1, r1, PPC_MIN_STKFRM
+	mtlr	r0
+	blr
 
 /*
  * Save away FP, VMX and VSX registers.
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index e76f79a45988..826c541a12af 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -226,7 +226,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
 	 */
 	vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
 	vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
-	vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
+	to_book3s(vcpu)->vtb += get_vtb() - vcpu->arch.entry_vtb;
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
 	svcpu->in_use = false;
@@ -448,6 +448,8 @@ void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr)
 	case PVR_POWER7:
 	case PVR_POWER7p:
 	case PVR_POWER8:
+	case PVR_POWER8E:
+	case PVR_POWER8NVL:
 		vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
 			BOOK3S_HFLAG_NEW_TLBIE;
 		break;
@@ -1361,6 +1363,9 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_HIOR:
 		*val = get_reg_val(id, to_book3s(vcpu)->hior);
 		break;
+	case KVM_REG_PPC_VTB:
+		*val = get_reg_val(id, to_book3s(vcpu)->vtb);
+		break;
 	case KVM_REG_PPC_LPCR:
 	case KVM_REG_PPC_LPCR_64:
 		/*
@@ -1397,6 +1402,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 		to_book3s(vcpu)->hior = set_reg_val(id, *val);
 		to_book3s(vcpu)->hior_explicit = true;
 		break;
+	case KVM_REG_PPC_VTB:
+		to_book3s(vcpu)->vtb = set_reg_val(id, *val);
+		break;
 	case KVM_REG_PPC_LPCR:
 	case KVM_REG_PPC_LPCR_64:
 		kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 05aa11399a78..3bdc639157c1 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -99,6 +99,10 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
 		return 0;
 	}
 
+	/* Record which CPU this arrived on for passed-through interrupts */
+	if (state->host_irq)
+		state->intr_cpu = raw_smp_processor_id();
+
 	/* Attempt delivery */
 	icp_deliver_irq(xics, NULL, irq);
 
@@ -812,7 +816,7 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 	return H_SUCCESS;
 }
 
-static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 {
 	struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
 	struct kvmppc_icp *icp = vcpu->arch.icp;
@@ -841,6 +845,7 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 
 	return H_SUCCESS;
 }
+EXPORT_SYMBOL_GPL(kvmppc_xics_rm_complete);
 
 int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 {
@@ -892,6 +897,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
 
 /* -- Initialisation code etc. -- */
 
+static void xics_debugfs_irqmap(struct seq_file *m,
+				struct kvmppc_passthru_irqmap *pimap)
+{
+	int i;
+
+	if (!pimap)
+		return;
+	seq_printf(m, "========\nPIRQ mappings: %d maps\n===========\n",
+				pimap->n_mapped);
+	for (i = 0; i < pimap->n_mapped; i++)  {
+		seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n",
+			pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq);
+	}
+}
+
 static int xics_debug_show(struct seq_file *m, void *private)
 {
 	struct kvmppc_xics *xics = m->private;
@@ -913,6 +933,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
 	t_check_resend = 0;
 	t_reject = 0;
 
+	xics_debugfs_irqmap(m, kvm->arch.pimap);
+
 	seq_printf(m, "=========\nICP state\n=========\n");
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -1252,6 +1274,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
 {
 	struct kvmppc_xics *xics = kvm->arch.xics;
 
+	if (!xics)
+		return -ENODEV;
 	return ics_deliver_irq(xics, irq, level);
 }
 
@@ -1418,3 +1442,34 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
 	return pin;
 }
+
+void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
+			    unsigned long host_irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	u16 idx;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return;
+
+	ics->irq_state[idx].host_irq = host_irq;
+	ics->irq_state[idx].intr_cpu = -1;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_set_mapped);
+
+void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long irq,
+			    unsigned long host_irq)
+{
+	struct kvmppc_xics *xics = kvm->arch.xics;
+	struct kvmppc_ics *ics;
+	u16 idx;
+
+	ics = kvmppc_xics_find_ics(xics, irq, &idx);
+	if (!ics)
+		return;
+
+	ics->irq_state[idx].host_irq = 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xics_clr_mapped);
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index a46b954055c4..2a50320b55ca 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -42,6 +42,8 @@ struct ics_irq_state {
 	u8  lsi;		/* level-sensitive interrupt */
 	u8  asserted; /* Only for LSI */
 	u8  exists;
+	int intr_cpu;
+	u32 host_irq;
 };
 
 /* Atomic ICP state, updated with a single compare & swap */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 02b4672f7347..df3f2706d3e5 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -2038,7 +2038,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 		if (type == KVMPPC_DEBUG_NONE)
 			continue;
 
-		if (type & !(KVMPPC_DEBUG_WATCH_READ |
+		if (type & ~(KVMPPC_DEBUG_WATCH_READ |
 			     KVMPPC_DEBUG_WATCH_WRITE |
 			     KVMPPC_DEBUG_BREAKPOINT))
 			return -EINVAL;
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 29911a07bcdb..ddbf8f0284c0 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -743,7 +743,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	char *virt;
 	struct page **pages;
 	struct tlbe_priv *privs[2] = {};
-	u64 *g2h_bitmap = NULL;
+	u64 *g2h_bitmap;
 	size_t array_len;
 	u32 sets;
 	int num_pages, ret, i;
@@ -779,41 +779,44 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
 	num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) -
 		    cfg->array / PAGE_SIZE;
-	pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL);
+	pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL);
 	if (!pages)
 		return -ENOMEM;
 
 	ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
 	if (ret < 0)
-		goto err_pages;
+		goto free_pages;
 
 	if (ret != num_pages) {
 		num_pages = ret;
 		ret = -EFAULT;
-		goto err_put_page;
+		goto put_pages;
 	}
 
 	virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
 	if (!virt) {
 		ret = -ENOMEM;
-		goto err_put_page;
+		goto put_pages;
 	}
 
-	privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
-			   GFP_KERNEL);
-	privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
-			   GFP_KERNEL);
+	privs[0] = kcalloc(params.tlb_sizes[0], sizeof(*privs[0]), GFP_KERNEL);
+	if (!privs[0]) {
+		ret = -ENOMEM;
+		goto put_pages;
+	}
 
-	if (!privs[0] || !privs[1]) {
+	privs[1] = kcalloc(params.tlb_sizes[1], sizeof(*privs[1]), GFP_KERNEL);
+	if (!privs[1]) {
 		ret = -ENOMEM;
-		goto err_privs;
+		goto free_privs_first;
 	}
 
-	g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
-	                     GFP_KERNEL);
+	g2h_bitmap = kcalloc(params.tlb_sizes[1],
+			     sizeof(*g2h_bitmap),
+			     GFP_KERNEL);
 	if (!g2h_bitmap) {
 		ret = -ENOMEM;
-		goto err_privs;
+		goto free_privs_second;
 	}
 
 	free_gtlb(vcpu_e500);
@@ -845,16 +848,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 
 	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	return 0;
-
-err_privs:
-	kfree(privs[0]);
+ free_privs_second:
 	kfree(privs[1]);
-
-err_put_page:
+ free_privs_first:
+	kfree(privs[0]);
+ put_pages:
 	for (i = 0; i < num_pages; i++)
 		put_page(pages[i]);
-
-err_pages:
+ free_pages:
 	kfree(pages);
 	return ret;
 }
@@ -904,11 +905,9 @@ static int vcpu_mmu_init(struct kvm_vcpu *vcpu,
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
-	int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
-	int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
 
 	if (e500_mmu_host_init(vcpu_e500))
-		goto err;
+		goto free_vcpu;
 
 	vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE;
 	vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE;
@@ -920,37 +919,39 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE;
 	vcpu_e500->gtlb_params[1].sets = 1;
 
-	vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL);
+	vcpu_e500->gtlb_arch = kmalloc_array(KVM_E500_TLB0_SIZE +
+					     KVM_E500_TLB1_SIZE,
+					     sizeof(*vcpu_e500->gtlb_arch),
+					     GFP_KERNEL);
 	if (!vcpu_e500->gtlb_arch)
 		return -ENOMEM;
 
 	vcpu_e500->gtlb_offset[0] = 0;
 	vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE;
 
-	vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) *
-					  vcpu_e500->gtlb_params[0].entries,
+	vcpu_e500->gtlb_priv[0] = kcalloc(vcpu_e500->gtlb_params[0].entries,
+					  sizeof(struct tlbe_ref),
 					  GFP_KERNEL);
 	if (!vcpu_e500->gtlb_priv[0])
-		goto err;
+		goto free_vcpu;
 
-	vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) *
-					  vcpu_e500->gtlb_params[1].entries,
+	vcpu_e500->gtlb_priv[1] = kcalloc(vcpu_e500->gtlb_params[1].entries,
+					  sizeof(struct tlbe_ref),
 					  GFP_KERNEL);
 	if (!vcpu_e500->gtlb_priv[1])
-		goto err;
+		goto free_vcpu;
 
-	vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
-					  vcpu_e500->gtlb_params[1].entries,
+	vcpu_e500->g2h_tlb1_map = kcalloc(vcpu_e500->gtlb_params[1].entries,
+					  sizeof(*vcpu_e500->g2h_tlb1_map),
 					  GFP_KERNEL);
 	if (!vcpu_e500->g2h_tlb1_map)
-		goto err;
+		goto free_vcpu;
 
 	vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params);
 
 	kvmppc_recalc_tlb1map_range(vcpu_e500);
 	return 0;
-
-err:
+ free_vcpu:
 	free_gtlb(vcpu_e500);
 	return -1;
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6ce40dd6fe51..70963c845e96 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -27,6 +27,8 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/module.h>
+#include <linux/irqbypass.h>
+#include <linux/kvm_irqfd.h>
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
@@ -436,6 +438,16 @@ err_out:
 	return -EINVAL;
 }
 
+bool kvm_arch_has_vcpu_debugfs(void)
+{
+	return false;
+}
+
+int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	unsigned int i;
@@ -739,6 +751,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * irq_bypass_add_producer and irq_bypass_del_producer are only
+ * useful if the architecture supports PCI passthrough.
+ * irq_bypass_stop and irq_bypass_start are not needed and so
+ * kvm_ops are not defined for them.
+ */
+bool kvm_arch_has_irq_bypass(void)
+{
+	return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) ||
+		(kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer));
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+				     struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+	struct kvm *kvm = irqfd->kvm;
+
+	if (kvm->arch.kvm_ops->irq_bypass_add_producer)
+		return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);
+
+	return 0;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+				      struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+	struct kvm *kvm = irqfd->kvm;
+
+	if (kvm->arch.kvm_ops->irq_bypass_del_producer)
+		kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
+}
+
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
@@ -1167,6 +1215,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 	return r;
 }
 
+bool kvm_arch_intc_initialized(struct kvm *kvm)
+{
+#ifdef CONFIG_KVM_MPIC
+	if (kvm->arch.mpic)
+		return true;
+#endif
+#ifdef CONFIG_KVM_XICS
+	if (kvm->arch.xics)
+		return true;
+#endif
+	return false;
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                     struct kvm_mp_state *mp_state)
 {
diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
index 33d9daff5783..fb21990c0fb4 100644
--- a/arch/powerpc/kvm/trace_hv.h
+++ b/arch/powerpc/kvm/trace_hv.h
@@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
 		   __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
 );
 
+TRACE_EVENT(kvmppc_vcore_wakeup,
+	TP_PROTO(int do_sleep, __u64 ns),
+
+	TP_ARGS(do_sleep, ns),
+
+	TP_STRUCT__entry(
+		__field(__u64,  ns)
+		__field(int,    waited)
+		__field(pid_t,  tgid)
+	),
+
+	TP_fast_assign(
+		__entry->ns     = ns;
+		__entry->waited = do_sleep;
+		__entry->tgid   = current->tgid;
+	),
+
+	TP_printk("%s time %lld ns, tgid=%d",
+		__entry->waited ? "wait" : "poll",
+		__entry->ns, __entry->tgid)
+);
+
 TRACE_EVENT(kvmppc_run_vcpu_enter,
 	TP_PROTO(struct kvm_vcpu *vcpu),