From 1e8341ae0c0e117f0626cd6cf6732a0a9c8723f2 Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Sun, 12 May 2013 07:26:22 +0800
Subject: powerpc: Move the patch_exception to a common place

So that it can be used by other codes. No function change.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/code-patching.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index a6f8c7a5cbb7..97e02f985df8 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -34,6 +34,13 @@ int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr);
 unsigned long branch_target(const unsigned int *instr);
 unsigned int translate_branch(const unsigned int *dest,
 			      const unsigned int *src);
+#ifdef CONFIG_PPC_BOOK3E_64
+void __patch_exception(int exc, unsigned long addr);
+#define patch_exception(exc, name) do { \
+	extern unsigned int name; \
+	__patch_exception((exc), (unsigned long)&name); \
+} while (0)
+#endif
 
 static inline unsigned long ppc_function_entry(void *func)
 {
-- 
cgit v1.2.3


From 0ce636700c5bad54eda0e62903a1803f6d67b31d Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Thu, 22 Aug 2013 09:30:35 +0800
Subject: powerpc: purge all the prefetched instructions for the coherent
 icache flush

As Benjamin Herrenschmidt has indicated, we still need a dummy icbi to
purge all the prefetched instructions from the ifetch buffers for the
snooping icache. We also need a sync before the icbi to order the
actual stores to memory that might have modified instructions with
the icbi.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/cache.h | 14 +++++++++++++-
 arch/powerpc/kernel/misc_32.S    |  4 +++-
 arch/powerpc/kernel/misc_64.S    |  6 ++++++
 3 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 9e495c9a6a88..ed0afc1e44a4 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -41,8 +41,20 @@ struct ppc64_caches {
 extern struct ppc64_caches ppc64_caches;
 #endif /* __powerpc64__ && ! __ASSEMBLY__ */
 
-#if !defined(__ASSEMBLY__)
+#if defined(__ASSEMBLY__)
+/*
+ * For a snooping icache, we still need a dummy icbi to purge all the
+ * prefetched instructions from the ifetch buffers. We also need a sync
+ * before the icbi to order the the actual stores to memory that might
+ * have modified instructions with the icbi.
+ */
+#define PURGE_PREFETCHED_INS	\
+	sync;			\
+	icbi	0,r3;		\
+	sync;			\
+	isync
 
+#else
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 #ifdef CONFIG_6xx
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index e47d268727a4..879f09620f83 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -344,7 +344,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
  */
 _KPROBE(flush_icache_range)
 BEGIN_FTR_SECTION
-	isync
+	PURGE_PREFETCHED_INS
 	blr				/* for 601, do nothing */
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	li	r5,L1_CACHE_BYTES-1
@@ -448,6 +448,7 @@ _GLOBAL(invalidate_dcache_range)
  */
 _GLOBAL(__flush_dcache_icache)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	rlwinm	r3,r3,0,0,31-PAGE_SHIFT		/* Get page base address */
@@ -489,6 +490,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_44x)
  */
 _GLOBAL(__flush_dcache_icache_phys)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr					/* for 601, do nothing */
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	mfmsr	r10
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index e59caf874d05..a9f7a79a3a40 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -67,6 +67,7 @@ PPC64_CACHES:
 
 _KPROBE(flush_icache_range)
 BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
 	blr
 END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 /*
@@ -211,6 +212,11 @@ _GLOBAL(__flush_dcache_icache)
  * Different systems have different cache line sizes
  */
 
+BEGIN_FTR_SECTION
+	PURGE_PREFETCHED_INS
+	blr
+END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
+
 /* Flush the dcache */
  	ld	r7,PPC64_CACHES@toc(r2)
 	clrrdi	r3,r3,PAGE_SHIFT           	    /* Page align */
-- 
cgit v1.2.3


From c041cfa2af1ccb8d0346dc576144a1085e9b4d4b Mon Sep 17 00:00:00 2001
From: "fan.du" <fan.du@windriver.com>
Date: Wed, 23 Jan 2013 16:06:11 +0800
Subject: powerpc: Make irq_stat.timers_irqs counting more specific

Current irq_stat.timers_irqs counting doesn't discriminate timer event handler
and other timer interrupt(like arch_irq_work_raise). Sometimes we need to know
exactly how much interrupts timer event handler fired, so let's be more specific
on this.

Signed-off-by: Fan Du <fan.du@windriver.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/hardirq.h |  3 ++-
 arch/powerpc/kernel/irq.c          | 12 +++++++++---
 arch/powerpc/kernel/time.c         |  3 ++-
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h
index 3bdcfce2c42a..418fb654370d 100644
--- a/arch/powerpc/include/asm/hardirq.h
+++ b/arch/powerpc/include/asm/hardirq.h
@@ -6,7 +6,8 @@
 
 typedef struct {
 	unsigned int __softirq_pending;
-	unsigned int timer_irqs;
+	unsigned int timer_irqs_event;
+	unsigned int timer_irqs_others;
 	unsigned int pmu_irqs;
 	unsigned int mce_exceptions;
 	unsigned int spurious_irqs;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ba0165615215..9729b23bfb0a 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -354,8 +354,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 
 	seq_printf(p, "%*s: ", prec, "LOC");
 	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs);
-        seq_printf(p, "  Local timer interrupts\n");
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_event);
+        seq_printf(p, "  Local timer interrupts for timer event device\n");
+
+	seq_printf(p, "%*s: ", prec, "LOC");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", per_cpu(irq_stat, j).timer_irqs_others);
+        seq_printf(p, "  Local timer interrupts for others\n");
 
 	seq_printf(p, "%*s: ", prec, "SPU");
 	for_each_online_cpu(j)
@@ -389,11 +394,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
  */
 u64 arch_irq_stat_cpu(unsigned int cpu)
 {
-	u64 sum = per_cpu(irq_stat, cpu).timer_irqs;
+	u64 sum = per_cpu(irq_stat, cpu).timer_irqs_event;
 
 	sum += per_cpu(irq_stat, cpu).pmu_irqs;
 	sum += per_cpu(irq_stat, cpu).mce_exceptions;
 	sum += per_cpu(irq_stat, cpu).spurious_irqs;
+	sum += per_cpu(irq_stat, cpu).timer_irqs_others;
 #ifdef CONFIG_PPC_DOORBELL
 	sum += per_cpu(irq_stat, cpu).doorbell_irqs;
 #endif
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index b3b144121cc9..afb1b56ef4fa 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -510,7 +510,6 @@ void timer_interrupt(struct pt_regs * regs)
 	 */
 	may_hard_irq_enable();
 
-	__get_cpu_var(irq_stat).timer_irqs++;
 
 #if defined(CONFIG_PPC32) && defined(CONFIG_PMAC)
 	if (atomic_read(&ppc_n_lost_interrupts) != 0)
@@ -532,10 +531,12 @@ void timer_interrupt(struct pt_regs * regs)
 		*next_tb = ~(u64)0;
 		if (evt->event_handler)
 			evt->event_handler(evt);
+		__get_cpu_var(irq_stat).timer_irqs_event++;
 	} else {
 		now = *next_tb - now;
 		if (now <= DECREMENTER_MAX)
 			set_dec((int)now);
+		__get_cpu_var(irq_stat).timer_irqs_others++;
 	}
 
 #ifdef CONFIG_PPC64
-- 
cgit v1.2.3


From b14a7253cf999412e5a0dd39d58b0a42d19fd73a Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:03:51 +0530
Subject: powerpc/book3s: Split the common exception prolog logic into two
 section.

This patch splits the common exception prolog logic into three parts to
facilitate reuse of existing code in the next patch. This patch also
re-arranges few instructions in such a way that the second part now deals
with saving register values from paca save area to stack frame, and
the third part deals with saving current register values to stack frame.

The second and third part will be reused in the machine check exception
routine in the subsequent patch.

Please note that this patch does not introduce or change existing code
logic. Instead it is just a code movement and instruction re-ordering.

Patch Acked-by Paul. But made some minor modification (explained above) to
address Paul's comment in the later patch(3).

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/exception-64s.h | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 894662a5d4d5..ff4e2e80856d 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -301,9 +301,12 @@ do_kvm_##n:								\
 	beq	4f;			/* if from kernel mode		*/ \
 	ACCOUNT_CPU_USER_ENTRY(r9, r10);				   \
 	SAVE_PPR(area, r9, r10);					   \
-4:	std	r2,GPR2(r1);		/* save r2 in stackframe	*/ \
-	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe	*/ \
-	SAVE_2GPRS(7, r1);		/* save r7, r8 in stackframe	*/ \
+4:	EXCEPTION_PROLOG_COMMON_2(area)					   \
+	EXCEPTION_PROLOG_COMMON_3(n)					   \
+	ACCOUNT_STOLEN_TIME
+
+/* Save original regs values from save area to stack frame. */
+#define EXCEPTION_PROLOG_COMMON_2(area)					   \
 	ld	r9,area+EX_R9(r13);	/* move r9, r10 to stackframe	*/ \
 	ld	r10,area+EX_R10(r13);					   \
 	std	r9,GPR9(r1);						   \
@@ -318,11 +321,16 @@ do_kvm_##n:								\
 	ld	r10,area+EX_CFAR(r13);					   \
 	std	r10,ORIG_GPR3(r1);					   \
 	END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);		   \
+	GET_CTR(r10, area);						   \
+	std	r10,_CTR(r1);
+
+#define EXCEPTION_PROLOG_COMMON_3(n)					   \
+	std	r2,GPR2(r1);		/* save r2 in stackframe	*/ \
+	SAVE_4GPRS(3, r1);		/* save r3 - r6 in stackframe   */ \
+	SAVE_2GPRS(7, r1);		/* save r7, r8 in stackframe	*/ \
 	mflr	r9;			/* Get LR, later save to stack	*/ \
 	ld	r2,PACATOC(r13);	/* get kernel TOC into r2	*/ \
 	std	r9,_LINK(r1);						   \
-	GET_CTR(r10, area);						   \
-	std	r10,_CTR(r1);						   \
 	lbz	r10,PACASOFTIRQEN(r13);				   \
 	mfspr	r11,SPRN_XER;		/* save XER in stackframe	*/ \
 	std	r10,SOFTE(r1);						   \
@@ -332,8 +340,7 @@ do_kvm_##n:								\
 	li	r10,0;							   \
 	ld	r11,exception_marker@toc(r2);				   \
 	std	r10,RESULT(r1);		/* clear regs->result		*/ \
-	std	r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame	*/ \
-	ACCOUNT_STOLEN_TIME
+	std	r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame	*/
 
 /*
  * Exception vectors.
-- 
cgit v1.2.3


From 729b0f715371ce1e7636b4958fc45d6882442456 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:04:00 +0530
Subject: powerpc/book3s: Introduce exclusive emergency stack for machine check
 exception.

This patch introduces exclusive emergency stack for machine check exception.
We use emergency stack to handle machine check exception so that we can save
MCE information (srr1, srr0, dar and dsisr) before turning on ME bit and be
ready for re-entrancy. This helps us to prevent clobbering of MCE information
in case of nested machine checks.

The reason for using emergency stack over normal kernel stack is that the
machine check might occur in the middle of setting up a stack frame which may
result into improper use of kernel stack.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/paca.h |  9 +++++++++
 arch/powerpc/kernel/setup_64.c  | 10 +++++++++-
 arch/powerpc/xmon/xmon.c        |  4 ++++
 3 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index b6ea9e068c13..c3523d1dda58 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -152,6 +152,15 @@ struct paca_struct {
 	 */
 	struct opal_machine_check_event *opal_mc_evt;
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	/* Exclusive emergency stack pointer for machine check exception. */
+	void *mc_emergency_sp;
+	/*
+	 * Flag to check whether we are in machine check early handler
+	 * and already using emergency stack.
+	 */
+	u16 in_mce;
+#endif
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 5760b9bea936..2232aff66059 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -540,7 +540,8 @@ static void __init exc_lvl_early_init(void)
 
 /*
  * Stack space used when we detect a bad kernel stack pointer, and
- * early in SMP boots before relocation is enabled.
+ * early in SMP boots before relocation is enabled. Exclusive emergency
+ * stack for machine checks.
  */
 static void __init emergency_stack_init(void)
 {
@@ -563,6 +564,13 @@ static void __init emergency_stack_init(void)
 		sp  = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
 		sp += THREAD_SIZE;
 		paca[i].emergency_sp = __va(sp);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+		/* emergency stack for machine check exception handling. */
+		sp  = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
+		sp += THREAD_SIZE;
+		paca[i].mc_emergency_sp = __va(sp);
+#endif
 	}
 }
 
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index af9d3469fb99..a90731b3d44a 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2051,6 +2051,10 @@ static void dump_one_paca(int cpu)
 	DUMP(p, stab_addr, "lx");
 #endif
 	DUMP(p, emergency_sp, "p");
+#ifdef CONFIG_PPC_BOOK3S_64
+	DUMP(p, mc_emergency_sp, "p");
+	DUMP(p, in_mce, "x");
+#endif
 	DUMP(p, data_offset, "lx");
 	DUMP(p, hw_cpu_id, "x");
 	DUMP(p, cpu_start, "x");
-- 
cgit v1.2.3


From 4c703416efc0a23f83a282b9240bb92fbd9e0be9 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:04:40 +0530
Subject: powerpc/book3s: Introduce a early machine check hook in cpu_spec.

This patch adds the early machine check function pointer in cputable for
CPU specific early machine check handling. The early machine handle routine
will be called in real mode to handle SLB and TLB errors. We can not reuse
the existing machine_check hook because it is always invoked in kernel
virtual mode and we would already be in trouble if we get SLB or TLB errors.
This patch just sets up a mechanism to invoke CPU specific handler. The
subsequent patches will populate the function pointer.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/cputable.h | 7 +++++++
 arch/powerpc/kernel/traps.c         | 7 +++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 0d4939ba48e7..c5b107afca4d 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -90,6 +90,13 @@ struct cpu_spec {
 	 * if the error is fatal, 1 if it was fully recovered and 0 to
 	 * pass up (not CPU originated) */
 	int		(*machine_check)(struct pt_regs *regs);
+
+	/*
+	 * Processor specific early machine check handler which is
+	 * called in real mode to handle SLB and TLB errors.
+	 */
+	long		(*machine_check_early)(struct pt_regs *regs);
+
 };
 
 extern struct cpu_spec		*cur_cpu_spec;
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 97a1ce72e130..330841766b09 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -293,8 +293,11 @@ void system_reset_exception(struct pt_regs *regs)
  */
 long machine_check_early(struct pt_regs *regs)
 {
-	/* TODO: handle/decode machine check reason */
-	return 0;
+	long handled = 0;
+
+	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
+		handled = cur_cpu_spec->machine_check_early(regs);
+	return handled;
 }
 
 #endif
-- 
cgit v1.2.3


From 0440705049b041d84268ea57f6e90e2f16618897 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:04:56 +0530
Subject: powerpc/book3s: Add flush_tlb operation in cpu_spec.

This patch introduces flush_tlb operation in cpu_spec structure. This will
help us to invoke appropriate CPU-side flush tlb routine. This patch
adds the foundation to invoke CPU specific flush routine for respective
architectures. Currently this patch introduce flush_tlb for p7 and p8.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/cputable.h   |  5 +++++
 arch/powerpc/kernel/cpu_setup_power.S | 38 +++++++++++++++++++++++++----------
 arch/powerpc/kernel/cputable.c        |  8 ++++++++
 arch/powerpc/kvm/book3s_hv_ras.c      | 18 ++++-------------
 4 files changed, 44 insertions(+), 25 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index c5b107afca4d..617cc767c076 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -97,6 +97,11 @@ struct cpu_spec {
 	 */
 	long		(*machine_check_early)(struct pt_regs *regs);
 
+	/*
+	 * Processor specific routine to flush tlbs.
+	 */
+	void		(*flush_tlb)(unsigned long inval_selector);
+
 };
 
 extern struct cpu_spec		*cur_cpu_spec;
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 18b5b9cf8e37..37d1bb002aa9 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -29,7 +29,7 @@ _GLOBAL(__setup_cpu_power7)
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
 	bl	__init_LPCR
-	bl	__init_TLB
+	bl	__init_tlb_power7
 	mtlr	r11
 	blr
 
@@ -42,7 +42,7 @@ _GLOBAL(__restore_cpu_power7)
 	mtspr	SPRN_LPID,r0
 	mfspr	r3,SPRN_LPCR
 	bl	__init_LPCR
-	bl	__init_TLB
+	bl	__init_tlb_power7
 	mtlr	r11
 	blr
 
@@ -59,7 +59,7 @@ _GLOBAL(__setup_cpu_power8)
 	oris	r3, r3, LPCR_AIL_3@h
 	bl	__init_LPCR
 	bl	__init_HFSCR
-	bl	__init_TLB
+	bl	__init_tlb_power8
 	bl	__init_PMU_HV
 	mtlr	r11
 	blr
@@ -78,7 +78,7 @@ _GLOBAL(__restore_cpu_power8)
 	oris	r3, r3, LPCR_AIL_3@h
 	bl	__init_LPCR
 	bl	__init_HFSCR
-	bl	__init_TLB
+	bl	__init_tlb_power8
 	bl	__init_PMU_HV
 	mtlr	r11
 	blr
@@ -134,15 +134,31 @@ __init_HFSCR:
 	mtspr	SPRN_HFSCR,r3
 	blr
 
-__init_TLB:
-	/*
-	 * Clear the TLB using the "IS 3" form of tlbiel instruction
-	 * (invalidate by congruence class). P7 has 128 CCs, P8 has 512
-	 * so we just always do 512
-	 */
+/*
+ * Clear the TLB using the specified IS form of tlbiel instruction
+ * (invalidate by congruence class). P7 has 128 CCs., P8 has 512.
+ *
+ * r3 = IS field
+ */
+__init_tlb_power7:
+	li	r3,0xc00	/* IS field = 0b11 */
+_GLOBAL(__flush_tlb_power7)
+	li	r6,128
+	mtctr	r6
+	mr	r7,r3		/* IS field */
+	ptesync
+2:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	2b
+	ptesync
+1:	blr
+
+__init_tlb_power8:
+	li	r3,0xc00	/* IS field = 0b11 */
+_GLOBAL(__flush_tlb_power8)
 	li	r6,512
 	mtctr	r6
-	li	r7,0xc00	/* IS field = 0b11 */
+	mr	r7,r3		/* IS field */
 	ptesync
 2:	tlbiel	r7
 	addi	r7,r7,0x1000
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 597d954e5860..55d0f9c282b8 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -71,6 +71,8 @@ extern void __restore_cpu_power7(void);
 extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec);
 extern void __restore_cpu_power8(void);
 extern void __restore_cpu_a2(void);
+extern void __flush_tlb_power7(unsigned long inval_selector);
+extern void __flush_tlb_power8(unsigned long inval_selector);
 #endif /* CONFIG_PPC64 */
 #if defined(CONFIG_E500)
 extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
@@ -440,6 +442,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
 		.platform		= "power7",
 	},
 	{	/* 2.07-compliant processor, i.e. Power8 "architected" mode */
@@ -456,6 +459,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_cpu_type	= "ppc64/ibm-compat-v1",
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
 		.platform		= "power8",
 	},
 	{	/* Power7 */
@@ -474,6 +478,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_POWER4,
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
 		.platform		= "power7",
 	},
 	{	/* Power7+ */
@@ -492,6 +497,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_POWER4,
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
+		.flush_tlb		= __flush_tlb_power7,
 		.platform		= "power7+",
 	},
 	{	/* Power8E */
@@ -510,6 +516,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_INVALID,
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
 		.platform		= "power8",
 	},
 	{	/* Power8 */
@@ -528,6 +535,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.oprofile_type		= PPC_OPROFILE_INVALID,
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
+		.flush_tlb		= __flush_tlb_power8,
 		.platform		= "power8",
 	},
 	{	/* Cell Broadband Engine */
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index a353c485808c..5c427b41a2f5 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -58,18 +58,6 @@ static void reload_slb(struct kvm_vcpu *vcpu)
 	}
 }
 
-/* POWER7 TLB flush */
-static void flush_tlb_power7(struct kvm_vcpu *vcpu)
-{
-	unsigned long i, rb;
-
-	rb = TLBIEL_INVAL_SET_LPID;
-	for (i = 0; i < POWER7_TLB_SETS; ++i) {
-		asm volatile("tlbiel %0" : : "r" (rb));
-		rb += 1 << TLBIEL_INVAL_SET_SHIFT;
-	}
-}
-
 /*
  * On POWER7, see if we can handle a machine check that occurred inside
  * the guest in real mode, without switching to the host partition.
@@ -96,7 +84,8 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
 				   DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI);
 		}
 		if (dsisr & DSISR_MC_TLB_MULTI) {
-			flush_tlb_power7(vcpu);
+			if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+				cur_cpu_spec->flush_tlb(TLBIEL_INVAL_SET_LPID);
 			dsisr &= ~DSISR_MC_TLB_MULTI;
 		}
 		/* Any other errors we don't understand? */
@@ -113,7 +102,8 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
 		reload_slb(vcpu);
 		break;
 	case SRR1_MC_IFETCH_TLBMULTI:
-		flush_tlb_power7(vcpu);
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_SET_LPID);
 		break;
 	default:
 		handled = 0;
-- 
cgit v1.2.3


From e22a22740c1ac23aaa10835f026b3549ee3e4e75 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:05:11 +0530
Subject: powerpc/book3s: Flush SLB/TLBs if we get SLB/TLB machine check errors
 on power7.

If we get a machine check exception due to SLB or TLB errors, then flush
SLBs/TLBs and reload SLBs to recover. We do this in real mode before turning
on MMU. Otherwise we would run into nested machine checks.

If we get a machine check when we are in guest, then just flush the
SLBs and continue. This patch handles errors for power7. The next
patch will handle errors for power8

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/bitops.h |   5 ++
 arch/powerpc/include/asm/mce.h    |  67 +++++++++++++++++
 arch/powerpc/kernel/Makefile      |   1 +
 arch/powerpc/kernel/cputable.c    |   4 +
 arch/powerpc/kernel/mce_power.c   | 150 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 227 insertions(+)
 create mode 100644 arch/powerpc/include/asm/mce.h
 create mode 100644 arch/powerpc/kernel/mce_power.c

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 910194e9a1e2..a5e9a7d494d8 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -46,6 +46,11 @@
 #include <asm/asm-compat.h>
 #include <asm/synch.h>
 
+/* PPC bit number conversion */
+#define PPC_BITLSHIFT(be)	(BITS_PER_LONG - 1 - (be))
+#define PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
+#define PPC_BITMASK(bs, be)	((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))
+
 /*
  * clear_bit doesn't imply a memory barrier
  */
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
new file mode 100644
index 000000000000..8157d4eaead6
--- /dev/null
+++ b/arch/powerpc/include/asm/mce.h
@@ -0,0 +1,67 @@
+/*
+ * Machine check exception header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __ASM_PPC64_MCE_H__
+#define __ASM_PPC64_MCE_H__
+
+#include <linux/bitops.h>
+
+/*
+ * Machine Check bits on power7 and power8
+ */
+#define P7_SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42)) /* P8 too */
+
+/* SRR1 bits for machine check (On Power7 and Power8) */
+#define P7_SRR1_MC_IFETCH(srr1)	((srr1) & PPC_BITMASK(43, 45)) /* P8 too */
+
+#define P7_SRR1_MC_IFETCH_UE		(0x1 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_SLB_PARITY	(0x2 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT	(0x3 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_SLB_BOTH	(0x4 << PPC_BITLSHIFT(45))
+#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT	(0x5 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD	(0x6 << PPC_BITLSHIFT(45)) /* P8 too */
+#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL	(0x7 << PPC_BITLSHIFT(45))
+
+/* SRR1 bits for machine check (On Power8) */
+#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT	(0x4 << PPC_BITLSHIFT(45))
+
+/* DSISR bits for machine check (On Power7 and Power8) */
+#define P7_DSISR_MC_UE			(PPC_BIT(48))	/* P8 too */
+#define P7_DSISR_MC_UE_TABLEWALK	(PPC_BIT(49))	/* P8 too */
+#define P7_DSISR_MC_ERAT_MULTIHIT	(PPC_BIT(52))	/* P8 too */
+#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB	(PPC_BIT(53))	/* P8 too */
+#define P7_DSISR_MC_SLB_PARITY_MFSLB	(PPC_BIT(55))	/* P8 too */
+#define P7_DSISR_MC_SLB_MULTIHIT	(PPC_BIT(56))	/* P8 too */
+#define P7_DSISR_MC_SLB_MULTIHIT_PARITY	(PPC_BIT(57))	/* P8 too */
+
+/*
+ * DSISR bits for machine check (Power8) in addition to above.
+ * Secondary DERAT Multihit
+ */
+#define P8_DSISR_MC_ERAT_MULTIHIT_SEC	(PPC_BIT(54))
+
+/* SLB error bits */
+#define P7_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_ERAT_MULTIHIT | \
+					 P7_DSISR_MC_SLB_PARITY_MFSLB | \
+					 P7_DSISR_MC_SLB_MULTIHIT | \
+					 P7_DSISR_MC_SLB_MULTIHIT_PARITY)
+
+#endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 445cb6e39d5b..07c63d0aa759 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= mce_power.o
 obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC_A2)		+= cpu_setup_a2.o
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 55d0f9c282b8..c54188bcdd9e 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -73,6 +73,7 @@ extern void __restore_cpu_power8(void);
 extern void __restore_cpu_a2(void);
 extern void __flush_tlb_power7(unsigned long inval_selector);
 extern void __flush_tlb_power8(unsigned long inval_selector);
+extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
 #endif /* CONFIG_PPC64 */
 #if defined(CONFIG_E500)
 extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
@@ -443,6 +444,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
 		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7",
 	},
 	{	/* 2.07-compliant processor, i.e. Power8 "architected" mode */
@@ -479,6 +481,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
 		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7",
 	},
 	{	/* Power7+ */
@@ -498,6 +501,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_setup		= __setup_cpu_power7,
 		.cpu_restore		= __restore_cpu_power7,
 		.flush_tlb		= __flush_tlb_power7,
+		.machine_check_early	= __machine_check_early_realmode_p7,
 		.platform		= "power7+",
 	},
 	{	/* Power8E */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
new file mode 100644
index 000000000000..690547319b03
--- /dev/null
+++ b/arch/powerpc/kernel/mce_power.c
@@ -0,0 +1,150 @@
+/*
+ * Machine check exception handling CPU-side for power7 and power8
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce_power: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <asm/mmu.h>
+#include <asm/mce.h>
+
+/* flush SLBs and reload */
+static void flush_and_reload_slb(void)
+{
+	struct slb_shadow *slb;
+	unsigned long i, n;
+
+	/* Invalidate all SLBs */
+	asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+#ifdef CONFIG_KVM_BOOK3S_HANDLER
+	/*
+	 * If machine check is hit when in guest or in transition, we will
+	 * only flush the SLBs and continue.
+	 */
+	if (get_paca()->kvm_hstate.in_guest)
+		return;
+#endif
+
+	/* For host kernel, reload the SLBs from shadow SLB buffer. */
+	slb = get_slb_shadow();
+	if (!slb)
+		return;
+
+	n = min_t(u32, slb->persistent, SLB_MIN_SIZE);
+
+	/* Load up the SLB entries from shadow SLB */
+	for (i = 0; i < n; i++) {
+		unsigned long rb = slb->save_area[i].esid;
+		unsigned long rs = slb->save_area[i].vsid;
+
+		rb = (rb & ~0xFFFul) | i;
+		asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+	}
+}
+
+static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
+{
+	long handled = 1;
+
+	/*
+	 * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
+	 * reset the error bits whenever we handle them so that at the end
+	 * we can check whether we handled all of them or not.
+	 * */
+	if (dsisr & slb_error_bits) {
+		flush_and_reload_slb();
+		/* reset error bits */
+		dsisr &= ~(slb_error_bits);
+	}
+	if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+		/* reset error bits */
+		dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
+	}
+	/* Any other errors we don't understand? */
+	if (dsisr & 0xffffffffUL)
+		handled = 0;
+
+	return handled;
+}
+
+static long mce_handle_derror_p7(uint64_t dsisr)
+{
+	return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
+}
+
+static long mce_handle_common_ierror(uint64_t srr1)
+{
+	long handled = 0;
+
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case 0:
+		break;
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+		/* flush and reload SLBs for SLB errors. */
+		flush_and_reload_slb();
+		handled = 1;
+		break;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
+			cur_cpu_spec->flush_tlb(TLBIEL_INVAL_PAGE);
+			handled = 1;
+		}
+		break;
+	default:
+		break;
+	}
+
+	return handled;
+}
+
+static long mce_handle_ierror_p7(uint64_t srr1)
+{
+	long handled = 0;
+
+	handled = mce_handle_common_ierror(srr1);
+
+	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+		flush_and_reload_slb();
+		handled = 1;
+	}
+	return handled;
+}
+
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
+{
+	uint64_t srr1;
+	long handled = 1;
+
+	srr1 = regs->msr;
+
+	if (P7_SRR1_MC_LOADSTORE(srr1))
+		handled = mce_handle_derror_p7(regs->dsisr);
+	else
+		handled = mce_handle_ierror_p7(srr1);
+
+	/* TODO: Decode machine check reason. */
+	return handled;
+}
-- 
cgit v1.2.3


From ae744f3432d3872c51298d922728e13c24ccc068 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:05:26 +0530
Subject: powerpc/book3s: Flush SLB/TLBs if we get SLB/TLB machine check errors
 on power8.

This patch handles the memory errors on power8. If we get a machine check
exception due to SLB or TLB errors, then flush SLBs/TLBs and reload SLBs to
recover.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mce.h  |  3 +++
 arch/powerpc/kernel/cputable.c  |  4 ++++
 arch/powerpc/kernel/mce_power.c | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 8157d4eaead6..e3ffa825b970 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -64,4 +64,7 @@
 					 P7_DSISR_MC_SLB_MULTIHIT | \
 					 P7_DSISR_MC_SLB_MULTIHIT_PARITY)
 
+#define P8_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_SLB_ERRORS | \
+					 P8_DSISR_MC_ERAT_MULTIHIT_SEC)
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index c54188bcdd9e..6c8dd5da4de5 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -74,6 +74,7 @@ extern void __restore_cpu_a2(void);
 extern void __flush_tlb_power7(unsigned long inval_selector);
 extern void __flush_tlb_power8(unsigned long inval_selector);
 extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
+extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
 #endif /* CONFIG_PPC64 */
 #if defined(CONFIG_E500)
 extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec);
@@ -462,6 +463,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
 		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
 		.platform		= "power8",
 	},
 	{	/* Power7 */
@@ -521,6 +523,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
 		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
 		.platform		= "power8",
 	},
 	{	/* Power8 */
@@ -540,6 +543,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.cpu_setup		= __setup_cpu_power8,
 		.cpu_restore		= __restore_cpu_power8,
 		.flush_tlb		= __flush_tlb_power8,
+		.machine_check_early	= __machine_check_early_realmode_p8,
 		.platform		= "power8",
 	},
 	{	/* Cell Broadband Engine */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 690547319b03..60a217f11275 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -148,3 +148,37 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs)
 	/* TODO: Decode machine check reason. */
 	return handled;
 }
+
+static long mce_handle_ierror_p8(uint64_t srr1)
+{
+	long handled = 0;
+
+	handled = mce_handle_common_ierror(srr1);
+
+	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+		flush_and_reload_slb();
+		handled = 1;
+	}
+	return handled;
+}
+
+static long mce_handle_derror_p8(uint64_t dsisr)
+{
+	return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
+}
+
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
+{
+	uint64_t srr1;
+	long handled = 1;
+
+	srr1 = regs->msr;
+
+	if (P7_SRR1_MC_LOADSTORE(srr1))
+		handled = mce_handle_derror_p8(regs->dsisr);
+	else
+		handled = mce_handle_ierror_p8(srr1);
+
+	/* TODO: Decode machine check reason. */
+	return handled;
+}
-- 
cgit v1.2.3


From 36df96f8acaf51992177645eb2d781f766ce97dc Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:05:40 +0530
Subject: powerpc/book3s: Decode and save machine check event.

Now that we handle machine check in linux, the MCE decoding should also
take place in linux host. This info is crucial to log before we go down
in case we can not handle the machine check errors. This patch decodes
and populates a machine check event which contain high level meaning full
MCE information.

We do this in real mode C code with ME bit on. The MCE information is still
available on emergency stack (in pt_regs structure format). Even if we take
another exception at this point the MCE early handler will allocate a new
stack frame on top of current one. So when we return back here we still have
our MCE information safe on current stack.

We use per cpu buffer to save high level MCE information. Each per cpu buffer
is an array of machine check event structure indexed by per cpu counter
mce_nest_count. The mce_nest_count is incremented every time we enter
machine check early handler in real mode to get the current free slot
(index = mce_nest_count - 1). The mce_nest_count is decremented once the
MCE info is consumed by virtual mode machine exception handler.

This patch provides save_mce_event(), get_mce_event() and release_mce_event()
generic routines that can be used by machine check handlers to populate and
retrieve the event. The routine release_mce_event() will free the event slot so
that it can be reused. Caller can invoke get_mce_event() with a release flag
either to release the event slot immediately OR keep it so that it can be
fetched again. The event slot can be also released anytime by invoking
release_mce_event().

This patch also updates kvm code to invoke get_mce_event to retrieve generic
mce event rather than paca->opal_mce_evt.

The KVM code always calls get_mce_event() with release flags set to false so
that event is available for linus host machine

If machine check occurs while we are in guest, KVM tries to handle the error.
If KVM is able to handle MC error successfully, it enters the guest and
delivers the machine check to guest. If KVM is not able to handle MC error, it
exists the guest and passes the control to linux host machine check handler
which then logs MC event and decides how to handle it in linux host. In failure
case, KVM needs to make sure that the MC event is available for linux host to
consume. Hence KVM always calls get_mce_event() with release flags set to false
and later it invokes release_mce_event() only if it succeeds to handle error.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mce.h        | 124 +++++++++++++++++++++++++
 arch/powerpc/kernel/Makefile          |   2 +-
 arch/powerpc/kernel/mce.c             | 164 ++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/mce_power.c       | 116 ++++++++++++++++++++++--
 arch/powerpc/kvm/book3s_hv_ras.c      |  32 ++++---
 arch/powerpc/platforms/powernv/opal.c |  35 ++++----
 6 files changed, 434 insertions(+), 39 deletions(-)
 create mode 100644 arch/powerpc/kernel/mce.c

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index e3ffa825b970..87cad2a808c2 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -66,5 +66,129 @@
 
 #define P8_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_SLB_ERRORS | \
 					 P8_DSISR_MC_ERAT_MULTIHIT_SEC)
+enum MCE_Version {
+	MCE_V1 = 1,
+};
+
+enum MCE_Severity {
+	MCE_SEV_NO_ERROR = 0,
+	MCE_SEV_WARNING = 1,
+	MCE_SEV_ERROR_SYNC = 2,
+	MCE_SEV_FATAL = 3,
+};
+
+enum MCE_Disposition {
+	MCE_DISPOSITION_RECOVERED = 0,
+	MCE_DISPOSITION_NOT_RECOVERED = 1,
+};
+
+enum MCE_Initiator {
+	MCE_INITIATOR_UNKNOWN = 0,
+	MCE_INITIATOR_CPU = 1,
+};
+
+enum MCE_ErrorType {
+	MCE_ERROR_TYPE_UNKNOWN = 0,
+	MCE_ERROR_TYPE_UE = 1,
+	MCE_ERROR_TYPE_SLB = 2,
+	MCE_ERROR_TYPE_ERAT = 3,
+	MCE_ERROR_TYPE_TLB = 4,
+};
+
+enum MCE_UeErrorType {
+	MCE_UE_ERROR_INDETERMINATE = 0,
+	MCE_UE_ERROR_IFETCH = 1,
+	MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH = 2,
+	MCE_UE_ERROR_LOAD_STORE = 3,
+	MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 4,
+};
+
+enum MCE_SlbErrorType {
+	MCE_SLB_ERROR_INDETERMINATE = 0,
+	MCE_SLB_ERROR_PARITY = 1,
+	MCE_SLB_ERROR_MULTIHIT = 2,
+};
+
+enum MCE_EratErrorType {
+	MCE_ERAT_ERROR_INDETERMINATE = 0,
+	MCE_ERAT_ERROR_PARITY = 1,
+	MCE_ERAT_ERROR_MULTIHIT = 2,
+};
+
+enum MCE_TlbErrorType {
+	MCE_TLB_ERROR_INDETERMINATE = 0,
+	MCE_TLB_ERROR_PARITY = 1,
+	MCE_TLB_ERROR_MULTIHIT = 2,
+};
+
+struct machine_check_event {
+	enum MCE_Version	version:8;	/* 0x00 */
+	uint8_t			in_use;		/* 0x01 */
+	enum MCE_Severity	severity:8;	/* 0x02 */
+	enum MCE_Initiator	initiator:8;	/* 0x03 */
+	enum MCE_ErrorType	error_type:8;	/* 0x04 */
+	enum MCE_Disposition	disposition:8;	/* 0x05 */
+	uint8_t			reserved_1[2];	/* 0x06 */
+	uint64_t		gpr3;		/* 0x08 */
+	uint64_t		srr0;		/* 0x10 */
+	uint64_t		srr1;		/* 0x18 */
+	union {					/* 0x20 */
+		struct {
+			enum MCE_UeErrorType ue_error_type:8;
+			uint8_t		effective_address_provided;
+			uint8_t		physical_address_provided;
+			uint8_t		reserved_1[5];
+			uint64_t	effective_address;
+			uint64_t	physical_address;
+			uint8_t		reserved_2[8];
+		} ue_error;
+
+		struct {
+			enum MCE_SlbErrorType slb_error_type:8;
+			uint8_t		effective_address_provided;
+			uint8_t		reserved_1[6];
+			uint64_t	effective_address;
+			uint8_t		reserved_2[16];
+		} slb_error;
+
+		struct {
+			enum MCE_EratErrorType erat_error_type:8;
+			uint8_t		effective_address_provided;
+			uint8_t		reserved_1[6];
+			uint64_t	effective_address;
+			uint8_t		reserved_2[16];
+		} erat_error;
+
+		struct {
+			enum MCE_TlbErrorType tlb_error_type:8;
+			uint8_t		effective_address_provided;
+			uint8_t		reserved_1[6];
+			uint64_t	effective_address;
+			uint8_t		reserved_2[16];
+		} tlb_error;
+	} u;
+};
+
+struct mce_error_info {
+	enum MCE_ErrorType error_type:8;
+	union {
+		enum MCE_UeErrorType ue_error_type:8;
+		enum MCE_SlbErrorType slb_error_type:8;
+		enum MCE_EratErrorType erat_error_type:8;
+		enum MCE_TlbErrorType tlb_error_type:8;
+	} u;
+	uint8_t		reserved[2];
+};
+
+#define MAX_MC_EVT	100
+
+/* Release flags for get_mce_event() */
+#define MCE_EVENT_RELEASE	true
+#define MCE_EVENT_DONTRELEASE	false
+
+extern void save_mce_event(struct pt_regs *regs, long handled,
+			   struct mce_error_info *mce_err, uint64_t addr);
+extern int get_mce_event(struct machine_check_event *mce, bool release);
+extern void release_mce_event(void);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 07c63d0aa759..904d713366ff 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64)	+= mce_power.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= mce.o mce_power.o
 obj64-$(CONFIG_RELOCATABLE)	+= reloc_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)	+= exceptions-64e.o idle_book3e.o
 obj-$(CONFIG_PPC_A2)		+= cpu_setup_a2.o
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
new file mode 100644
index 000000000000..aeecdf1ba897
--- /dev/null
+++ b/arch/powerpc/kernel/mce.c
@@ -0,0 +1,164 @@
+/*
+ * Machine check exception handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "mce: " fmt
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <linux/export.h>
+#include <asm/mce.h>
+
+static DEFINE_PER_CPU(int, mce_nest_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
+
+static void mce_set_error_info(struct machine_check_event *mce,
+			       struct mce_error_info *mce_err)
+{
+	mce->error_type = mce_err->error_type;
+	switch (mce_err->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
+		break;
+	case MCE_ERROR_TYPE_UNKNOWN:
+	default:
+		break;
+	}
+}
+
+/*
+ * Decode and save high level MCE information into per cpu buffer which
+ * is an array of machine_check_event structure.
+ */
+void save_mce_event(struct pt_regs *regs, long handled,
+		    struct mce_error_info *mce_err,
+		    uint64_t addr)
+{
+	uint64_t srr1;
+	int index = __get_cpu_var(mce_nest_count)++;
+	struct machine_check_event *mce = &__get_cpu_var(mce_event[index]);
+
+	/*
+	 * Return if we don't have enough space to log mce event.
+	 * mce_nest_count may go beyond MAX_MC_EVT but that's ok,
+	 * the check below will stop buffer overrun.
+	 */
+	if (index >= MAX_MC_EVT)
+		return;
+
+	/* Populate generic machine check info */
+	mce->version = MCE_V1;
+	mce->srr0 = regs->nip;
+	mce->srr1 = regs->msr;
+	mce->gpr3 = regs->gpr[3];
+	mce->in_use = 1;
+
+	mce->initiator = MCE_INITIATOR_CPU;
+	if (handled)
+		mce->disposition = MCE_DISPOSITION_RECOVERED;
+	else
+		mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
+	mce->severity = MCE_SEV_ERROR_SYNC;
+
+	srr1 = regs->msr;
+
+	/*
+	 * Populate the mce error_type and type-specific error_type.
+	 */
+	mce_set_error_info(mce, mce_err);
+
+	if (!addr)
+		return;
+
+	if (mce->error_type == MCE_ERROR_TYPE_TLB) {
+		mce->u.tlb_error.effective_address_provided = true;
+		mce->u.tlb_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
+		mce->u.slb_error.effective_address_provided = true;
+		mce->u.slb_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
+		mce->u.erat_error.effective_address_provided = true;
+		mce->u.erat_error.effective_address = addr;
+	} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+		mce->u.ue_error.effective_address_provided = true;
+		mce->u.ue_error.effective_address = addr;
+	}
+	return;
+}
+
+/*
+ * get_mce_event:
+ *	mce	Pointer to machine_check_event structure to be filled.
+ *	release Flag to indicate whether to free the event slot or not.
+ *		0 <= do not release the mce event. Caller will invoke
+ *		     release_mce_event() once event has been consumed.
+ *		1 <= release the slot.
+ *
+ *	return	1 = success
+ *		0 = failure
+ *
+ * get_mce_event() will be called by platform specific machine check
+ * handle routine and in KVM.
+ * When we call get_mce_event(), we are still in interrupt context and
+ * preemption will not be scheduled until ret_from_expect() routine
+ * is called.
+ */
+int get_mce_event(struct machine_check_event *mce, bool release)
+{
+	int index = __get_cpu_var(mce_nest_count) - 1;
+	struct machine_check_event *mc_evt;
+	int ret = 0;
+
+	/* Sanity check */
+	if (index < 0)
+		return ret;
+
+	/* Check if we have MCE info to process. */
+	if (index < MAX_MC_EVT) {
+		mc_evt = &__get_cpu_var(mce_event[index]);
+		/* Copy the event structure and release the original */
+		if (mce)
+			*mce = *mc_evt;
+		if (release)
+			mc_evt->in_use = 0;
+		ret = 1;
+	}
+	/* Decrement the count to free the slot. */
+	if (release)
+		__get_cpu_var(mce_nest_count)--;
+
+	return ret;
+}
+
+void release_mce_event(void)
+{
+	get_mce_event(NULL, true);
+}
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 60a217f11275..b36e777a734f 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -133,22 +133,116 @@ static long mce_handle_ierror_p7(uint64_t srr1)
 	return handled;
 }
 
+static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	switch (P7_SRR1_MC_IFETCH(srr1)) {
+	case P7_SRR1_MC_IFETCH_SLB_PARITY:
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+		break;
+	case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+		break;
+	case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
+		mce_err->error_type = MCE_ERROR_TYPE_TLB;
+		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+		break;
+	case P7_SRR1_MC_IFETCH_UE:
+	case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
+		break;
+	case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type =
+				MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
+		break;
+	}
+}
+
+static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	mce_get_common_ierror(mce_err, srr1);
+	if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+	}
+}
+
+static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+	if (dsisr & P7_DSISR_MC_UE) {
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
+	} else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
+		mce_err->error_type = MCE_ERROR_TYPE_UE;
+		mce_err->u.ue_error_type =
+				MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
+	} else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
+	} else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
+		mce_err->error_type = MCE_ERROR_TYPE_TLB;
+		mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
+	} else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
+		mce_err->error_type = MCE_ERROR_TYPE_SLB;
+		mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
+	}
+}
+
 long __machine_check_early_realmode_p7(struct pt_regs *regs)
 {
-	uint64_t srr1;
+	uint64_t srr1, addr;
 	long handled = 1;
+	struct mce_error_info mce_error_info = { 0 };
 
 	srr1 = regs->msr;
 
-	if (P7_SRR1_MC_LOADSTORE(srr1))
+	/*
+	 * Handle memory errors depending whether this was a load/store or
+	 * ifetch exception. Also, populate the mce error_type and
+	 * type-specific error_type from either SRR1 or DSISR, depending
+	 * whether this was a load/store or ifetch exception
+	 */
+	if (P7_SRR1_MC_LOADSTORE(srr1)) {
 		handled = mce_handle_derror_p7(regs->dsisr);
-	else
+		mce_get_derror_p7(&mce_error_info, regs->dsisr);
+		addr = regs->dar;
+	} else {
 		handled = mce_handle_ierror_p7(srr1);
+		mce_get_ierror_p7(&mce_error_info, srr1);
+		addr = regs->nip;
+	}
 
-	/* TODO: Decode machine check reason. */
+	save_mce_event(regs, handled, &mce_error_info, addr);
 	return handled;
 }
 
+static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
+{
+	mce_get_common_ierror(mce_err, srr1);
+	if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	}
+}
+
+static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
+{
+	mce_get_derror_p7(mce_err, dsisr);
+	if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
+		mce_err->error_type = MCE_ERROR_TYPE_ERAT;
+		mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
+	}
+}
+
 static long mce_handle_ierror_p8(uint64_t srr1)
 {
 	long handled = 0;
@@ -169,16 +263,22 @@ static long mce_handle_derror_p8(uint64_t dsisr)
 
 long __machine_check_early_realmode_p8(struct pt_regs *regs)
 {
-	uint64_t srr1;
+	uint64_t srr1, addr;
 	long handled = 1;
+	struct mce_error_info mce_error_info = { 0 };
 
 	srr1 = regs->msr;
 
-	if (P7_SRR1_MC_LOADSTORE(srr1))
+	if (P7_SRR1_MC_LOADSTORE(srr1)) {
 		handled = mce_handle_derror_p8(regs->dsisr);
-	else
+		mce_get_derror_p8(&mce_error_info, regs->dsisr);
+		addr = regs->dar;
+	} else {
 		handled = mce_handle_ierror_p8(srr1);
+		mce_get_ierror_p8(&mce_error_info, srr1);
+		addr = regs->nip;
+	}
 
-	/* TODO: Decode machine check reason. */
+	save_mce_event(regs, handled, &mce_error_info, addr);
 	return handled;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 5c427b41a2f5..768a9f977c00 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -12,6 +12,7 @@
 #include <linux/kvm_host.h>
 #include <linux/kernel.h>
 #include <asm/opal.h>
+#include <asm/mce.h>
 
 /* SRR1 bits for machine check on POWER7 */
 #define SRR1_MC_LDSTERR		(1ul << (63-42))
@@ -67,9 +68,7 @@ static void reload_slb(struct kvm_vcpu *vcpu)
 static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
 {
 	unsigned long srr1 = vcpu->arch.shregs.msr;
-#ifdef CONFIG_PPC_POWERNV
-	struct opal_machine_check_event *opal_evt;
-#endif
+	struct machine_check_event mce_evt;
 	long handled = 1;
 
 	if (srr1 & SRR1_MC_LDSTERR) {
@@ -109,22 +108,31 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
 		handled = 0;
 	}
 
-#ifdef CONFIG_PPC_POWERNV
 	/*
-	 * See if OPAL has already handled the condition.
-	 * We assume that if the condition is recovered then OPAL
+	 * See if we have already handled the condition in the linux host.
+	 * We assume that if the condition is recovered then linux host
 	 * will have generated an error log event that we will pick
 	 * up and log later.
+	 * Don't release mce event now. In case if condition is not
+	 * recovered we do guest exit and go back to linux host machine
+	 * check handler. Hence we need make sure that current mce event
+	 * is available for linux host to consume.
 	 */
-	opal_evt = local_paca->opal_mc_evt;
-	if (opal_evt->version == OpalMCE_V1 &&
-	    (opal_evt->severity == OpalMCE_SEV_NO_ERROR ||
-	     opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED))
+	if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE))
+		goto out;
+
+	if (mce_evt.version == MCE_V1 &&
+	    (mce_evt.severity == MCE_SEV_NO_ERROR ||
+	     mce_evt.disposition == MCE_DISPOSITION_RECOVERED))
 		handled = 1;
 
+out:
+	/*
+	 * If we have handled the error, then release the mce event because
+	 * we will be delivering machine check to guest.
+	 */
 	if (handled)
-		opal_evt->in_use = 0;
-#endif
+		release_mce_event();
 
 	return handled;
 }
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 1c798cd55372..c5e71d773f47 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -21,6 +21,7 @@
 #include <linux/kobject.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
+#include <asm/mce.h>
 
 #include "powernv.h"
 
@@ -256,8 +257,7 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 
 int opal_machine_check(struct pt_regs *regs)
 {
-	struct opal_machine_check_event *opal_evt = get_paca()->opal_mc_evt;
-	struct opal_machine_check_event evt;
+	struct machine_check_event evt;
 	const char *level, *sevstr, *subtype;
 	static const char *opal_mc_ue_types[] = {
 		"Indeterminate",
@@ -282,30 +282,29 @@ int opal_machine_check(struct pt_regs *regs)
 		"Multihit",
 	};
 
-	/* Copy the event structure and release the original */
-	evt = *opal_evt;
-	opal_evt->in_use = 0;
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return 0;
 
 	/* Print things out */
-	if (evt.version != OpalMCE_V1) {
+	if (evt.version != MCE_V1) {
 		pr_err("Machine Check Exception, Unknown event version %d !\n",
 		       evt.version);
 		return 0;
 	}
 	switch(evt.severity) {
-	case OpalMCE_SEV_NO_ERROR:
+	case MCE_SEV_NO_ERROR:
 		level = KERN_INFO;
 		sevstr = "Harmless";
 		break;
-	case OpalMCE_SEV_WARNING:
+	case MCE_SEV_WARNING:
 		level = KERN_WARNING;
 		sevstr = "";
 		break;
-	case OpalMCE_SEV_ERROR_SYNC:
+	case MCE_SEV_ERROR_SYNC:
 		level = KERN_ERR;
 		sevstr = "Severe";
 		break;
-	case OpalMCE_SEV_FATAL:
+	case MCE_SEV_FATAL:
 	default:
 		level = KERN_ERR;
 		sevstr = "Fatal";
@@ -313,12 +312,12 @@ int opal_machine_check(struct pt_regs *regs)
 	}
 
 	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
-	       evt.disposition == OpalMCE_DISPOSITION_RECOVERED ?
+	       evt.disposition == MCE_DISPOSITION_RECOVERED ?
 	       "Recovered" : "[Not recovered");
 	printk("%s  Initiator: %s\n", level,
-	       evt.initiator == OpalMCE_INITIATOR_CPU ? "CPU" : "Unknown");
+	       evt.initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
 	switch(evt.error_type) {
-	case OpalMCE_ERROR_TYPE_UE:
+	case MCE_ERROR_TYPE_UE:
 		subtype = evt.u.ue_error.ue_error_type <
 			ARRAY_SIZE(opal_mc_ue_types) ?
 			opal_mc_ue_types[evt.u.ue_error.ue_error_type]
@@ -331,7 +330,7 @@ int opal_machine_check(struct pt_regs *regs)
 			printk("%s      Physial address: %016llx\n",
 			       level, evt.u.ue_error.physical_address);
 		break;
-	case OpalMCE_ERROR_TYPE_SLB:
+	case MCE_ERROR_TYPE_SLB:
 		subtype = evt.u.slb_error.slb_error_type <
 			ARRAY_SIZE(opal_mc_slb_types) ?
 			opal_mc_slb_types[evt.u.slb_error.slb_error_type]
@@ -341,7 +340,7 @@ int opal_machine_check(struct pt_regs *regs)
 			printk("%s    Effective address: %016llx\n",
 			       level, evt.u.slb_error.effective_address);
 		break;
-	case OpalMCE_ERROR_TYPE_ERAT:
+	case MCE_ERROR_TYPE_ERAT:
 		subtype = evt.u.erat_error.erat_error_type <
 			ARRAY_SIZE(opal_mc_erat_types) ?
 			opal_mc_erat_types[evt.u.erat_error.erat_error_type]
@@ -351,7 +350,7 @@ int opal_machine_check(struct pt_regs *regs)
 			printk("%s    Effective address: %016llx\n",
 			       level, evt.u.erat_error.effective_address);
 		break;
-	case OpalMCE_ERROR_TYPE_TLB:
+	case MCE_ERROR_TYPE_TLB:
 		subtype = evt.u.tlb_error.tlb_error_type <
 			ARRAY_SIZE(opal_mc_tlb_types) ?
 			opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type]
@@ -362,11 +361,11 @@ int opal_machine_check(struct pt_regs *regs)
 			       level, evt.u.tlb_error.effective_address);
 		break;
 	default:
-	case OpalMCE_ERROR_TYPE_UNKNOWN:
+	case MCE_ERROR_TYPE_UNKNOWN:
 		printk("%s  Error type: Unknown\n", level);
 		break;
 	}
-	return evt.severity == OpalMCE_SEV_FATAL ? 0 : 1;
+	return evt.severity == MCE_SEV_FATAL ? 0 : 1;
 }
 
 static irqreturn_t opal_interrupt(int irq, void *data)
-- 
cgit v1.2.3


From b5ff4211a8294be2ddbaf963fa3666fa042292a8 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:05:49 +0530
Subject: powerpc/book3s: Queue up and process delayed MCE events.

When machine check real mode handler can not continue into host kernel
in V mode, it returns from the interrupt and we loose MCE event which
never gets logged. In such a situation queue up the MCE event so that
we can log it later when we get back into host kernel with r1 pointing to
kernel stack e.g. during syscall exit.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mce.h        |   3 +
 arch/powerpc/kernel/entry_64.S        |   5 ++
 arch/powerpc/kernel/exceptions-64s.S  |   7 +-
 arch/powerpc/kernel/mce.c             | 154 ++++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal.c |  97 +--------------------
 5 files changed, 168 insertions(+), 98 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 87cad2a808c2..3276b409299c 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -190,5 +190,8 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
 			   struct mce_error_info *mce_err, uint64_t addr);
 extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
+extern void machine_check_queue_event(void);
+extern void machine_check_process_queued_event(void);
+extern void machine_check_print_event_info(struct machine_check_event *evt);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index bbfb0294b354..770d6d65c47b 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -183,6 +183,11 @@ syscall_exit:
 #ifdef SHOW_SYSCALLS
 	bl	.do_show_syscall_exit
 	ld	r3,RESULT(r1)
+#endif
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+	bl	.machine_check_process_queued_event
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 #endif
 	CURRENT_THREAD_INFO(r12, r1)
 
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 1aec3025eeee..862b9dd4a9db 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -855,7 +855,8 @@ BEGIN_FTR_SECTION
 	/* Supervisor state loss */
 	li	r0,1
 	stb	r0,PACA_NAPSTATELOST(r13)
-3:	MACHINE_CHECK_HANDLER_WINDUP
+3:	bl	.machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
 	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	b	.power7_enter_nap_mode
@@ -895,8 +896,10 @@ BEGIN_FTR_SECTION
 2:
 	/*
 	 * Return from MC interrupt.
-	 * TODO: Queue up the MCE event so that we can log it later.
+	 * Queue up the MCE event so that we can log it later, while
+	 * returning from kernel or opal call.
 	 */
+	bl	.machine_check_queue_event
 	MACHINE_CHECK_HANDLER_WINDUP
 	rfid
 9:
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index aeecdf1ba897..1c6d15701c56 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -31,6 +31,10 @@
 static DEFINE_PER_CPU(int, mce_nest_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
 
+/* Queue for delayed MCE events. */
+static DEFINE_PER_CPU(int, mce_queue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
+
 static void mce_set_error_info(struct machine_check_event *mce,
 			       struct mce_error_info *mce_err)
 {
@@ -162,3 +166,153 @@ void release_mce_event(void)
 {
 	get_mce_event(NULL, true);
 }
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_queue_event(void)
+{
+	int index;
+	struct machine_check_event evt;
+
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return;
+
+	index = __get_cpu_var(mce_queue_count)++;
+	/* If queue is full, just return for now. */
+	if (index >= MAX_MC_EVT) {
+		__get_cpu_var(mce_queue_count)--;
+		return;
+	}
+	__get_cpu_var(mce_event_queue[index]) = evt;
+}
+
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+void machine_check_process_queued_event(void)
+{
+	int index;
+
+	preempt_disable();
+	/*
+	 * For now just print it to console.
+	 * TODO: log this error event to FSP or nvram.
+	 */
+	while (__get_cpu_var(mce_queue_count) > 0) {
+		index = __get_cpu_var(mce_queue_count) - 1;
+		machine_check_print_event_info(
+				&__get_cpu_var(mce_event_queue[index]));
+		__get_cpu_var(mce_queue_count)--;
+	}
+	preempt_enable();
+}
+
+void machine_check_print_event_info(struct machine_check_event *evt)
+{
+	const char *level, *sevstr, *subtype;
+	static const char *mc_ue_types[] = {
+		"Indeterminate",
+		"Instruction fetch",
+		"Page table walk ifetch",
+		"Load/Store",
+		"Page table walk Load/Store",
+	};
+	static const char *mc_slb_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_erat_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+	static const char *mc_tlb_types[] = {
+		"Indeterminate",
+		"Parity",
+		"Multihit",
+	};
+
+	/* Print things out */
+	if (evt->version != MCE_V1) {
+		pr_err("Machine Check Exception, Unknown event version %d !\n",
+		       evt->version);
+		return;
+	}
+	switch (evt->severity) {
+	case MCE_SEV_NO_ERROR:
+		level = KERN_INFO;
+		sevstr = "Harmless";
+		break;
+	case MCE_SEV_WARNING:
+		level = KERN_WARNING;
+		sevstr = "";
+		break;
+	case MCE_SEV_ERROR_SYNC:
+		level = KERN_ERR;
+		sevstr = "Severe";
+		break;
+	case MCE_SEV_FATAL:
+	default:
+		level = KERN_ERR;
+		sevstr = "Fatal";
+		break;
+	}
+
+	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
+	       evt->disposition == MCE_DISPOSITION_RECOVERED ?
+	       "Recovered" : "[Not recovered");
+	printk("%s  Initiator: %s\n", level,
+	       evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		subtype = evt->u.ue_error.ue_error_type <
+			ARRAY_SIZE(mc_ue_types) ?
+			mc_ue_types[evt->u.ue_error.ue_error_type]
+			: "Unknown";
+		printk("%s  Error type: UE [%s]\n", level, subtype);
+		if (evt->u.ue_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.ue_error.effective_address);
+		if (evt->u.ue_error.physical_address_provided)
+			printk("%s      Physial address: %016llx\n",
+			       level, evt->u.ue_error.physical_address);
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		subtype = evt->u.slb_error.slb_error_type <
+			ARRAY_SIZE(mc_slb_types) ?
+			mc_slb_types[evt->u.slb_error.slb_error_type]
+			: "Unknown";
+		printk("%s  Error type: SLB [%s]\n", level, subtype);
+		if (evt->u.slb_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.slb_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		subtype = evt->u.erat_error.erat_error_type <
+			ARRAY_SIZE(mc_erat_types) ?
+			mc_erat_types[evt->u.erat_error.erat_error_type]
+			: "Unknown";
+		printk("%s  Error type: ERAT [%s]\n", level, subtype);
+		if (evt->u.erat_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.erat_error.effective_address);
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		subtype = evt->u.tlb_error.tlb_error_type <
+			ARRAY_SIZE(mc_tlb_types) ?
+			mc_tlb_types[evt->u.tlb_error.tlb_error_type]
+			: "Unknown";
+		printk("%s  Error type: TLB [%s]\n", level, subtype);
+		if (evt->u.tlb_error.effective_address_provided)
+			printk("%s    Effective address: %016llx\n",
+			       level, evt->u.tlb_error.effective_address);
+		break;
+	default:
+	case MCE_ERROR_TYPE_UNKNOWN:
+		printk("%s  Error type: Unknown\n", level);
+		break;
+	}
+}
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index c5e71d773f47..245096f90437 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -258,29 +258,6 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 int opal_machine_check(struct pt_regs *regs)
 {
 	struct machine_check_event evt;
-	const char *level, *sevstr, *subtype;
-	static const char *opal_mc_ue_types[] = {
-		"Indeterminate",
-		"Instruction fetch",
-		"Page table walk ifetch",
-		"Load/Store",
-		"Page table walk Load/Store",
-	};
-	static const char *opal_mc_slb_types[] = {
-		"Indeterminate",
-		"Parity",
-		"Multihit",
-	};
-	static const char *opal_mc_erat_types[] = {
-		"Indeterminate",
-		"Parity",
-		"Multihit",
-	};
-	static const char *opal_mc_tlb_types[] = {
-		"Indeterminate",
-		"Parity",
-		"Multihit",
-	};
 
 	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
 		return 0;
@@ -291,80 +268,8 @@ int opal_machine_check(struct pt_regs *regs)
 		       evt.version);
 		return 0;
 	}
-	switch(evt.severity) {
-	case MCE_SEV_NO_ERROR:
-		level = KERN_INFO;
-		sevstr = "Harmless";
-		break;
-	case MCE_SEV_WARNING:
-		level = KERN_WARNING;
-		sevstr = "";
-		break;
-	case MCE_SEV_ERROR_SYNC:
-		level = KERN_ERR;
-		sevstr = "Severe";
-		break;
-	case MCE_SEV_FATAL:
-	default:
-		level = KERN_ERR;
-		sevstr = "Fatal";
-		break;
-	}
+	machine_check_print_event_info(&evt);
 
-	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
-	       evt.disposition == MCE_DISPOSITION_RECOVERED ?
-	       "Recovered" : "[Not recovered");
-	printk("%s  Initiator: %s\n", level,
-	       evt.initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
-	switch(evt.error_type) {
-	case MCE_ERROR_TYPE_UE:
-		subtype = evt.u.ue_error.ue_error_type <
-			ARRAY_SIZE(opal_mc_ue_types) ?
-			opal_mc_ue_types[evt.u.ue_error.ue_error_type]
-			: "Unknown";
-		printk("%s  Error type: UE [%s]\n", level, subtype);
-		if (evt.u.ue_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt.u.ue_error.effective_address);
-		if (evt.u.ue_error.physical_address_provided)
-			printk("%s      Physial address: %016llx\n",
-			       level, evt.u.ue_error.physical_address);
-		break;
-	case MCE_ERROR_TYPE_SLB:
-		subtype = evt.u.slb_error.slb_error_type <
-			ARRAY_SIZE(opal_mc_slb_types) ?
-			opal_mc_slb_types[evt.u.slb_error.slb_error_type]
-			: "Unknown";
-		printk("%s  Error type: SLB [%s]\n", level, subtype);
-		if (evt.u.slb_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt.u.slb_error.effective_address);
-		break;
-	case MCE_ERROR_TYPE_ERAT:
-		subtype = evt.u.erat_error.erat_error_type <
-			ARRAY_SIZE(opal_mc_erat_types) ?
-			opal_mc_erat_types[evt.u.erat_error.erat_error_type]
-			: "Unknown";
-		printk("%s  Error type: ERAT [%s]\n", level, subtype);
-		if (evt.u.erat_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt.u.erat_error.effective_address);
-		break;
-	case MCE_ERROR_TYPE_TLB:
-		subtype = evt.u.tlb_error.tlb_error_type <
-			ARRAY_SIZE(opal_mc_tlb_types) ?
-			opal_mc_tlb_types[evt.u.tlb_error.tlb_error_type]
-			: "Unknown";
-		printk("%s  Error type: TLB [%s]\n", level, subtype);
-		if (evt.u.tlb_error.effective_address_provided)
-			printk("%s    Effective address: %016llx\n",
-			       level, evt.u.tlb_error.effective_address);
-		break;
-	default:
-	case MCE_ERROR_TYPE_UNKNOWN:
-		printk("%s  Error type: Unknown\n", level);
-		break;
-	}
 	return evt.severity == MCE_SEV_FATAL ? 0 : 1;
 }
 
-- 
cgit v1.2.3


From b63a0ffe35de7e5f9b907bbc2c783e702f7e15af Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Wed, 30 Oct 2013 20:06:13 +0530
Subject: powerpc/powernv: Machine check exception handling.

Add basic error handling in machine check exception handler.

- If MSR_RI isn't set, we can not recover.
- Check if disposition set to OpalMCE_DISPOSITION_RECOVERED.
- Check if address at fault is inside kernel address space, if not then send
  SIGBUS to process if we hit exception when in userspace.
- If address at fault is not provided then and if we get a synchronous machine
  check while in userspace then kill the task.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mce.h        |  1 +
 arch/powerpc/kernel/mce.c             | 27 ++++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal.c | 43 ++++++++++++++++++++++++++++++++++-
 3 files changed, 70 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 3276b409299c..a2b8c7b35fba 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -193,5 +193,6 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_process_queued_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt);
+extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 1c6d15701c56..c0c52ec1fca7 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -316,3 +316,30 @@ void machine_check_print_event_info(struct machine_check_event *evt)
 		break;
 	}
 }
+
+uint64_t get_mce_fault_addr(struct machine_check_event *evt)
+{
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		if (evt->u.ue_error.effective_address_provided)
+			return evt->u.ue_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		if (evt->u.slb_error.effective_address_provided)
+			return evt->u.slb_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		if (evt->u.erat_error.effective_address_provided)
+			return evt->u.erat_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		if (evt->u.tlb_error.effective_address_provided)
+			return evt->u.tlb_error.effective_address;
+		break;
+	default:
+	case MCE_ERROR_TYPE_UNKNOWN:
+		break;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(get_mce_fault_addr);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index f348bd49487c..01e74cbc67e9 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 #include <linux/kobject.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
@@ -251,6 +252,44 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 	return written;
 }
 
+static int opal_recover_mce(struct pt_regs *regs,
+					struct machine_check_event *evt)
+{
+	int recovered = 0;
+	uint64_t ea = get_mce_fault_addr(evt);
+
+	if (!(regs->msr & MSR_RI)) {
+		/* If MSR_RI isn't set, we cannot recover */
+		recovered = 0;
+	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
+		/* Platform corrected itself */
+		recovered = 1;
+	} else if (ea && !is_kernel_addr(ea)) {
+		/*
+		 * Faulting address is not in kernel text. We should be fine.
+		 * We need to find which process uses this address.
+		 * For now, kill the task if we have received exception when
+		 * in userspace.
+		 *
+		 * TODO: Queue up this address for hwpoisioning later.
+		 */
+		if (user_mode(regs) && !is_global_init(current)) {
+			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+			recovered = 1;
+		} else
+			recovered = 0;
+	} else if (user_mode(regs) && !is_global_init(current) &&
+		evt->severity == MCE_SEV_ERROR_SYNC) {
+		/*
+		 * If we have received a synchronous error when in userspace
+		 * kill the task.
+		 */
+		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+		recovered = 1;
+	}
+	return recovered;
+}
+
 int opal_machine_check(struct pt_regs *regs)
 {
 	struct machine_check_event evt;
@@ -266,7 +305,9 @@ int opal_machine_check(struct pt_regs *regs)
 	}
 	machine_check_print_event_info(&evt);
 
-	return evt.severity == MCE_SEV_FATAL ? 0 : 1;
+	if (opal_recover_mce(regs, &evt))
+		return 1;
+	return 0;
 }
 
 static irqreturn_t opal_interrupt(int irq, void *data)
-- 
cgit v1.2.3


From 24366360035a9e0a9870ed7208aa2ba1948f844d Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Mon, 18 Nov 2013 15:35:58 +0530
Subject: powerpc/powernv: Infrastructure to read opal messages in generic
 format.

Opal now has a new messaging infrastructure to push the messages to
linux in a generic format for different type of messages using only one
event bit. The format of the opal message is as below:

struct opal_msg {
        uint32_t msg_type;
	uint32_t reserved;
	uint64_t params[8];
};

This patch allows clients to subscribe for notification for specific
message type. It is upto the subscriber to decipher the messages who showed
interested in receiving specific message type.

The interface to subscribe for notification is:

	int opal_message_notifier_register(enum OpalMessageType msg_type,
                                        struct notifier_block *nb)

The notifier will fetch the opal message when available and notify the
subscriber with message type and the opal message. It is subscribers
responsibility to copy the message data before returning from notifier
callback.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/opal.h                | 24 ++++++-
 arch/powerpc/platforms/powernv/opal-wrappers.S |  2 +
 arch/powerpc/platforms/powernv/opal.c          | 90 ++++++++++++++++++++++++++
 3 files changed, 115 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 033c06be1d84..ffb2036fcfb2 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -132,6 +132,8 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_FLASH_VALIDATE			76
 #define OPAL_FLASH_MANAGE			77
 #define OPAL_FLASH_UPDATE			78
+#define OPAL_GET_MSG				85
+#define OPAL_CHECK_ASYNC_COMPLETION		86
 
 #ifndef __ASSEMBLY__
 
@@ -211,7 +213,16 @@ enum OpalPendingState {
 	OPAL_EVENT_ERROR_LOG		= 0x40,
 	OPAL_EVENT_EPOW			= 0x80,
 	OPAL_EVENT_LED_STATUS		= 0x100,
-	OPAL_EVENT_PCI_ERROR		= 0x200
+	OPAL_EVENT_PCI_ERROR		= 0x200,
+	OPAL_EVENT_MSG_PENDING		= 0x800,
+};
+
+enum OpalMessageType {
+	OPAL_MSG_ASYNC_COMP		= 0,
+	OPAL_MSG_MEM_ERR,
+	OPAL_MSG_EPOW,
+	OPAL_MSG_SHUTDOWN,
+	OPAL_MSG_TYPE_MAX,
 };
 
 /* Machine check related definitions */
@@ -356,6 +367,12 @@ enum OpalLPCAddressType {
 	OPAL_LPC_FW	= 2,
 };
 
+struct opal_msg {
+	uint32_t msg_type;
+	uint32_t reserved;
+	uint64_t params[8];
+};
+
 struct opal_machine_check_event {
 	enum OpalMCE_Version	version:8;	/* 0x00 */
 	uint8_t			in_use;		/* 0x01 */
@@ -731,6 +748,9 @@ int64_t opal_validate_flash(uint64_t buffer, uint32_t *size, uint32_t *result);
 int64_t opal_manage_flash(uint8_t op);
 int64_t opal_update_flash(uint64_t blk_list);
 
+int64_t opal_get_msg(uint64_t buffer, size_t size);
+int64_t opal_check_completion(uint64_t buffer, size_t size, uint64_t token);
+
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
 
@@ -744,6 +764,8 @@ extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
 				   int depth, void *data);
 
 extern int opal_notifier_register(struct notifier_block *nb);
+extern int opal_message_notifier_register(enum OpalMessageType msg_type,
+						struct notifier_block *nb);
 extern void opal_notifier_enable(void);
 extern void opal_notifier_disable(void);
 extern void opal_notifier_update_evt(uint64_t evt_mask, uint64_t evt_val);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index e7806504e976..719aa5c325c6 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -126,3 +126,5 @@ OPAL_CALL(opal_return_cpu,			OPAL_RETURN_CPU);
 OPAL_CALL(opal_validate_flash,			OPAL_FLASH_VALIDATE);
 OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE);
+OPAL_CALL(opal_get_msg,				OPAL_GET_MSG);
+OPAL_CALL(opal_check_completion,		OPAL_CHECK_ASYNC_COMPLETION);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 01e74cbc67e9..7a184a0ff183 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -40,6 +40,7 @@ extern u64 opal_mc_secondary_handler[];
 static unsigned int *opal_irqs;
 static unsigned int opal_irq_count;
 static ATOMIC_NOTIFIER_HEAD(opal_notifier_head);
+static struct atomic_notifier_head opal_msg_notifier_head[OPAL_MSG_TYPE_MAX];
 static DEFINE_SPINLOCK(opal_notifier_lock);
 static uint64_t last_notified_mask = 0x0ul;
 static atomic_t opal_notifier_hold = ATOMIC_INIT(0);
@@ -167,6 +168,95 @@ void opal_notifier_disable(void)
 	atomic_set(&opal_notifier_hold, 1);
 }
 
+/*
+ * Opal message notifier based on message type. Allow subscribers to get
+ * notified for specific messgae type.
+ */
+int opal_message_notifier_register(enum OpalMessageType msg_type,
+					struct notifier_block *nb)
+{
+	if (!nb) {
+		pr_warning("%s: Invalid argument (%p)\n",
+			   __func__, nb);
+		return -EINVAL;
+	}
+	if (msg_type > OPAL_MSG_TYPE_MAX) {
+		pr_warning("%s: Invalid message type argument (%d)\n",
+			   __func__, msg_type);
+		return -EINVAL;
+	}
+	return atomic_notifier_chain_register(
+				&opal_msg_notifier_head[msg_type], nb);
+}
+
+static void opal_message_do_notify(uint32_t msg_type, void *msg)
+{
+	/* notify subscribers */
+	atomic_notifier_call_chain(&opal_msg_notifier_head[msg_type],
+					msg_type, msg);
+}
+
+static void opal_handle_message(void)
+{
+	s64 ret;
+	/*
+	 * TODO: pre-allocate a message buffer depending on opal-msg-size
+	 * value in /proc/device-tree.
+	 */
+	static struct opal_msg msg;
+
+	ret = opal_get_msg(__pa(&msg), sizeof(msg));
+	/* No opal message pending. */
+	if (ret == OPAL_RESOURCE)
+		return;
+
+	/* check for errors. */
+	if (ret) {
+		pr_warning("%s: Failed to retrive opal message, err=%lld\n",
+				__func__, ret);
+		return;
+	}
+
+	/* Sanity check */
+	if (msg.msg_type > OPAL_MSG_TYPE_MAX) {
+		pr_warning("%s: Unknown message type: %u\n",
+				__func__, msg.msg_type);
+		return;
+	}
+	opal_message_do_notify(msg.msg_type, (void *)&msg);
+}
+
+static int opal_message_notify(struct notifier_block *nb,
+			  unsigned long events, void *change)
+{
+	if (events & OPAL_EVENT_MSG_PENDING)
+		opal_handle_message();
+	return 0;
+}
+
+static struct notifier_block opal_message_nb = {
+	.notifier_call	= opal_message_notify,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_message_init(void)
+{
+	int ret, i;
+
+	for (i = 0; i < OPAL_MSG_TYPE_MAX; i++)
+		ATOMIC_INIT_NOTIFIER_HEAD(&opal_msg_notifier_head[i]);
+
+	ret = opal_notifier_register(&opal_message_nb);
+	if (ret) {
+		pr_err("%s: Can't register OPAL event notifier (%d)\n",
+		       __func__, ret);
+		return ret;
+	}
+	return 0;
+}
+early_initcall(opal_message_init);
+
 int opal_get_chars(uint32_t vtermno, char *buf, int count)
 {
 	s64 rc;
-- 
cgit v1.2.3


From 7e1ce5a492e18449fd47ef6305b26e0c572d26e9 Mon Sep 17 00:00:00 2001
From: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Date: Mon, 18 Nov 2013 16:39:22 +0530
Subject: powerpc/powernv: Move SG list structure to header file

Move SG list and entry structure to header file so that
it can be used in other places as well.

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/opal.h             | 22 ++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal-flash.c | 31 ++++++-----------------------
 2 files changed, 28 insertions(+), 25 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ffb2036fcfb2..0a2ac85998d7 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -33,6 +33,28 @@ struct opal_takeover_args {
 	u64	rd_loc;			/* r11 */
 };
 
+/*
+ * SG entry
+ *
+ * WARNING: The current implementation requires each entry
+ * to represent a block that is 4k aligned *and* each block
+ * size except the last one in the list to be as well.
+ */
+struct opal_sg_entry {
+	void    *data;
+	long    length;
+};
+
+/* sg list */
+struct opal_sg_list {
+	unsigned long num_entries;
+	struct opal_sg_list *next;
+	struct opal_sg_entry entry[];
+};
+
+/* We calculate number of sg entries based on PAGE_SIZE */
+#define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry))
+
 extern long opal_query_takeover(u64 *hal_size, u64 *hal_align);
 
 extern long opal_do_takeover(struct opal_takeover_args *args);
diff --git a/arch/powerpc/platforms/powernv/opal-flash.c b/arch/powerpc/platforms/powernv/opal-flash.c
index 6ffa6b1ec5b7..4aeae4f36e1d 100644
--- a/arch/powerpc/platforms/powernv/opal-flash.c
+++ b/arch/powerpc/platforms/powernv/opal-flash.c
@@ -103,27 +103,6 @@ struct image_header_t {
 	uint32_t	size;
 };
 
-/* Scatter/gather entry */
-struct opal_sg_entry {
-	void	*data;
-	long	length;
-};
-
-/* We calculate number of entries based on PAGE_SIZE */
-#define SG_ENTRIES_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct opal_sg_entry))
-
-/*
- * This struct is very similar but not identical to that
- * needed by the opal flash update. All we need to do for
- * opal is rewrite num_entries into a version/length and
- * translate the pointers to absolute.
- */
-struct opal_sg_list {
-	unsigned long num_entries;
-	struct opal_sg_list *next;
-	struct opal_sg_entry entry[SG_ENTRIES_PER_NODE];
-};
-
 struct validate_flash_t {
 	int		status;		/* Return status */
 	void		*buf;		/* Candiate image buffer */
@@ -333,7 +312,7 @@ static struct opal_sg_list *image_data_to_sglist(void)
 	addr = image_data.data;
 	size = image_data.size;
 
-	sg1 = kzalloc((sizeof(struct opal_sg_list)), GFP_KERNEL);
+	sg1 = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!sg1)
 		return NULL;
 
@@ -351,8 +330,7 @@ static struct opal_sg_list *image_data_to_sglist(void)
 
 		sg1->num_entries++;
 		if (sg1->num_entries >= SG_ENTRIES_PER_NODE) {
-			sg1->next = kzalloc((sizeof(struct opal_sg_list)),
-					    GFP_KERNEL);
+			sg1->next = kzalloc(PAGE_SIZE, GFP_KERNEL);
 			if (!sg1->next) {
 				pr_err("%s : Failed to allocate memory\n",
 				       __func__);
@@ -402,7 +380,10 @@ static int opal_flash_update(int op)
 		else
 			sg->next = NULL;
 
-		/* Make num_entries into the version/length field */
+		/*
+		 * Convert num_entries to version/length format
+		 * to satisfy OPAL.
+		 */
 		sg->num_entries = (SG_LIST_VERSION << 56) |
 			(sg->num_entries * sizeof(struct opal_sg_entry) + 16);
 	}
-- 
cgit v1.2.3


From d905c5df9aef38d63df268f6f5e7b13894f626d3 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Thu, 21 Nov 2013 17:43:14 +1100
Subject: PPC: POWERNV: move iommu_add_device earlier

The current implementation of IOMMU on sPAPR does not use iommu_ops
and therefore does not call IOMMU API's bus_set_iommu() which
1) sets iommu_ops for a bus
2) registers a bus notifier
Instead, PCI devices are added to IOMMU groups from
subsys_initcall_sync(tce_iommu_init) which does basically the same
thing without using iommu_ops callbacks.

However Freescale PAMU driver (https://lkml.org/lkml/2013/7/1/158)
implements iommu_ops and when tce_iommu_init is called, every PCI device
is already added to some group so there is a conflict.

This patch does 2 things:
1. removes the loop in which PCI devices were added to groups and
adds explicit iommu_add_device() calls to add devices as soon as they get
the iommu_table pointer assigned to them.
2. moves a bus notifier to powernv code in order to avoid conflict with
the notifier from Freescale driver.

iommu_add_device() and iommu_del_device() are public now.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/iommu.h            | 26 ++++++++++++++++
 arch/powerpc/kernel/iommu.c                 | 48 +++--------------------------
 arch/powerpc/platforms/powernv/pci-ioda.c   |  8 ++---
 arch/powerpc/platforms/powernv/pci-p5ioc2.c |  2 +-
 arch/powerpc/platforms/powernv/pci.c        | 33 +++++++++++++++++++-
 arch/powerpc/platforms/pseries/iommu.c      |  8 +++--
 6 files changed, 72 insertions(+), 53 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index c34656a8925e..774fa2776907 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -101,8 +101,34 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
 					    int nid);
+#ifdef CONFIG_IOMMU_API
 extern void iommu_register_group(struct iommu_table *tbl,
 				 int pci_domain_number, unsigned long pe_num);
+extern int iommu_add_device(struct device *dev);
+extern void iommu_del_device(struct device *dev);
+#else
+static inline void iommu_register_group(struct iommu_table *tbl,
+					int pci_domain_number,
+					unsigned long pe_num)
+{
+}
+
+static inline int iommu_add_device(struct device *dev)
+{
+	return 0;
+}
+
+static inline void iommu_del_device(struct device *dev)
+{
+}
+#endif /* !CONFIG_IOMMU_API */
+
+static inline void set_iommu_table_base_and_group(struct device *dev,
+						  void *base)
+{
+	set_iommu_table_base(dev, base);
+	iommu_add_device(dev);
+}
 
 extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 			struct scatterlist *sglist, int nelems,
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 572bb5b95f35..d22abe0b08e3 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1105,7 +1105,7 @@ void iommu_release_ownership(struct iommu_table *tbl)
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-static int iommu_add_device(struct device *dev)
+int iommu_add_device(struct device *dev)
 {
 	struct iommu_table *tbl;
 	int ret = 0;
@@ -1134,52 +1134,12 @@ static int iommu_add_device(struct device *dev)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(iommu_add_device);
 
-static void iommu_del_device(struct device *dev)
+void iommu_del_device(struct device *dev)
 {
 	iommu_group_remove_device(dev);
 }
-
-static int iommu_bus_notifier(struct notifier_block *nb,
-			      unsigned long action, void *data)
-{
-	struct device *dev = data;
-
-	switch (action) {
-	case BUS_NOTIFY_ADD_DEVICE:
-		return iommu_add_device(dev);
-	case BUS_NOTIFY_DEL_DEVICE:
-		iommu_del_device(dev);
-		return 0;
-	default:
-		return 0;
-	}
-}
-
-static struct notifier_block tce_iommu_bus_nb = {
-	.notifier_call = iommu_bus_notifier,
-};
-
-static int __init tce_iommu_init(void)
-{
-	struct pci_dev *pdev = NULL;
-
-	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
-
-	for_each_pci_dev(pdev)
-		iommu_add_device(&pdev->dev);
-
-	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
-	return 0;
-}
-
-subsys_initcall_sync(tce_iommu_init);
-
-#else
-
-void iommu_register_group(struct iommu_table *tbl,
-		int pci_domain_number, unsigned long pe_num)
-{
-}
+EXPORT_SYMBOL_GPL(iommu_del_device);
 
 #endif /* CONFIG_IOMMU_API */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 084cdfa40682..614356cac466 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -460,7 +460,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 		return;
 
 	pe = &phb->ioda.pe_array[pdn->pe_number];
-	set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+	set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
 }
 
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
@@ -468,7 +468,7 @@ static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
 	struct pci_dev *dev;
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
-		set_iommu_table_base(&dev->dev, &pe->tce32_table);
+		set_iommu_table_base_and_group(&dev->dev, &pe->tce32_table);
 		if (dev->subordinate)
 			pnv_ioda_setup_bus_dma(pe, dev->subordinate);
 	}
@@ -644,7 +644,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
 	iommu_register_group(tbl, pci_domain_nr(pe->pbus), pe->pe_number);
 
 	if (pe->pdev)
-		set_iommu_table_base(&pe->pdev->dev, tbl);
+		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
 	else
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 
@@ -722,7 +722,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
 	iommu_init_table(tbl, phb->hose->node);
 
 	if (pe->pdev)
-		set_iommu_table_base(&pe->pdev->dev, tbl);
+		set_iommu_table_base_and_group(&pe->pdev->dev, tbl);
 	else
 		pnv_ioda_setup_bus_dma(pe, pe->pbus);
 
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
index f8b4bd8afb2e..e3807d69393e 100644
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
@@ -92,7 +92,7 @@ static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
 				pci_domain_nr(phb->hose->bus), phb->opal_id);
 	}
 
-	set_iommu_table_base(&pdev->dev, &phb->p5ioc2.iommu_table);
+	set_iommu_table_base_and_group(&pdev->dev, &phb->p5ioc2.iommu_table);
 }
 
 static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 4eb33a9ed532..6f3d49c18d64 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -536,7 +536,7 @@ static void pnv_pci_dma_fallback_setup(struct pci_controller *hose,
 		pdn->iommu_table = pnv_pci_setup_bml_iommu(hose);
 	if (!pdn->iommu_table)
 		return;
-	set_iommu_table_base(&pdev->dev, pdn->iommu_table);
+	set_iommu_table_base_and_group(&pdev->dev, pdn->iommu_table);
 }
 
 static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
@@ -657,3 +657,34 @@ void __init pnv_pci_init(void)
 	ppc_md.teardown_msi_irqs = pnv_teardown_msi_irqs;
 #endif
 }
+
+static int tce_iommu_bus_notifier(struct notifier_block *nb,
+		unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	switch (action) {
+	case BUS_NOTIFY_ADD_DEVICE:
+		return iommu_add_device(dev);
+	case BUS_NOTIFY_DEL_DEVICE:
+		if (dev->iommu_group)
+			iommu_del_device(dev);
+		return 0;
+	default:
+		return 0;
+	}
+}
+
+static struct notifier_block tce_iommu_bus_nb = {
+	.notifier_call = tce_iommu_bus_notifier,
+};
+
+static int __init tce_iommu_bus_notifier_init(void)
+{
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+
+	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
+	return 0;
+}
+
+subsys_initcall_sync(tce_iommu_bus_notifier_init);
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index f253361552ae..a80af6c20cba 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -687,7 +687,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		iommu_table_setparms(phb, dn, tbl);
 		PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node);
 		iommu_register_group(tbl, pci_domain_nr(phb->bus), 0);
-		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
+		set_iommu_table_base_and_group(&dev->dev,
+					       PCI_DN(dn)->iommu_table);
 		return;
 	}
 
@@ -699,7 +700,8 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
 		dn = dn->parent;
 
 	if (dn && PCI_DN(dn))
-		set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table);
+		set_iommu_table_base_and_group(&dev->dev,
+					       PCI_DN(dn)->iommu_table);
 	else
 		printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
 		       pci_name(dev));
@@ -1193,7 +1195,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
 		pr_debug("  found DMA window, table: %p\n", pci->iommu_table);
 	}
 
-	set_iommu_table_base(&dev->dev, pci->iommu_table);
+	set_iommu_table_base_and_group(&dev->dev, pci->iommu_table);
 }
 
 static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask)
-- 
cgit v1.2.3


From 0150a3dd92fc45b599bf2442b47d40921b4aa2d2 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 29 Nov 2013 13:27:18 +1100
Subject: powerpc: Add real mode cache inhibited IO accessors

These accessors allow us to do cache inhibited accesses when in real
mode. They should only be used in real mode.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/io.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 575fbf81fad0..97d3869991ca 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -191,8 +191,24 @@ DEF_MMIO_OUT_D(out_le32, 32, stw);
 
 #endif /* __BIG_ENDIAN */
 
+/*
+ * Cache inhibitied accessors for use in real mode, you don't want to use these
+ * unless you know what you're doing.
+ *
+ * NB. These use the cpu byte ordering.
+ */
+DEF_MMIO_OUT_X(out_rm8,   8, stbcix);
+DEF_MMIO_OUT_X(out_rm16, 16, sthcix);
+DEF_MMIO_OUT_X(out_rm32, 32, stwcix);
+DEF_MMIO_IN_X(in_rm8,   8, lbzcix);
+DEF_MMIO_IN_X(in_rm16, 16, lhzcix);
+DEF_MMIO_IN_X(in_rm32, 32, lwzcix);
+
 #ifdef __powerpc64__
 
+DEF_MMIO_OUT_X(out_rm64, 64, stdcix);
+DEF_MMIO_IN_X(in_rm64, 64, ldcix);
+
 #ifdef __BIG_ENDIAN__
 DEF_MMIO_OUT_D(out_be64, 64, std);
 DEF_MMIO_IN_D(in_be64, 64, ld);
-- 
cgit v1.2.3


From 1a8f6f97ea4dbaaa21b05cae2dacea47e4aea37b Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@ozlabs.org>
Date: Thu, 5 Dec 2013 11:31:08 +0800
Subject: powerpc: Make slb_shadow a local

The only external user of slb_shadow is the pseries lpar code, and it
can access through the paca array instead.

Signed-off-by: Jeremy Kerr <jk@ozlabs.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/lppaca.h     | 2 --
 arch/powerpc/kernel/paca.c            | 2 +-
 arch/powerpc/platforms/pseries/lpar.c | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index 844c28de7ec0..d0a2a2f99564 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -132,8 +132,6 @@ struct slb_shadow {
 	} save_area[SLB_NUM_BOLTED];
 } ____cacheline_aligned;
 
-extern struct slb_shadow slb_shadow[];
-
 /*
  * Layout of entries in the hypervisor's dispatch trace log buffer.
  */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 0620eaaaad45..9095a6f7ac2c 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -99,7 +99,7 @@ static inline void free_lppacas(void) { }
  * 3 persistent SLBs are registered here.  The buffer will be zero
  * initially, hence will all be invaild until we actually write them.
  */
-struct slb_shadow slb_shadow[] __cacheline_aligned = {
+static struct slb_shadow slb_shadow[] __cacheline_aligned = {
 	[0 ... (NR_CPUS-1)] = {
 		.persistent = cpu_to_be32(SLB_NUM_BOLTED),
 		.buffer_length = cpu_to_be32(sizeof(struct slb_shadow)),
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 4fca3def9db9..28cf0f33c5be 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -92,7 +92,7 @@ void vpa_init(int cpu)
 	 * PAPR says this feature is SLB-Buffer but firmware never
 	 * reports that.  All SPLPAR support SLB shadow buffer.
 	 */
-	addr = __pa(&slb_shadow[cpu]);
+	addr = __pa(paca[cpu].slb_shadow_ptr);
 	if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
 		ret = register_slb_shadow(hwcpu, addr);
 		if (ret)
-- 
cgit v1.2.3


From c8c06f5a0dde0fed260c54d550962187f266ed0d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 18 Nov 2013 14:58:10 +0530
Subject: powerpc/mm: Free up _PAGE_COHERENCE for numa fault use later

Set  memory coherence always on hash64 config. If
a platform cannot have memory coherence always set they
can infer that from _PAGE_NO_CACHE and _PAGE_WRITETHRU
like in lpar. So we dont' really need a separate bit
for tracking _PAGE_COHERENCE.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pte-hash64.h |  2 +-
 arch/powerpc/mm/hash_low_64.S         | 15 ++++++++++++---
 arch/powerpc/mm/hash_utils_64.c       |  7 ++++---
 arch/powerpc/mm/hugepage-hash64.c     |  6 +++++-
 arch/powerpc/mm/hugetlbpage-hash64.c  |  4 ++++
 5 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index 0419eeb53274..55aea0caf95e 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -19,7 +19,7 @@
 #define _PAGE_FILE		0x0002 /* (!present only) software: pte holds file offset */
 #define _PAGE_EXEC		0x0004 /* No execute on POWER4 and newer (we invert) */
 #define _PAGE_GUARDED		0x0008
-#define _PAGE_COHERENT		0x0010 /* M: enforce memory coherence (SMP systems) */
+/* We can derive Memory coherence from _PAGE_NO_CACHE */
 #define _PAGE_NO_CACHE		0x0020 /* I: cache inhibit */
 #define _PAGE_WRITETHRU		0x0040 /* W: cache write-through */
 #define _PAGE_DIRTY		0x0080 /* C: page changed */
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index d3cbda62857b..1136d26a95ae 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -148,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r30,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...) 
@@ -457,7 +460,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r3,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...)
@@ -795,7 +801,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
 	andc	r0,r30,r0		/* r0 = pte & ~r0 */
 	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-	ori	r3,r3,HPTE_R_C		/* Always add "C" bit for perf. */
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	ori	r3,r3,HPTE_R_C | HPTE_R_M
 
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6176b3cdf579..de6881259aef 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -169,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long pteflags)
 	if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
 					 (pteflags & _PAGE_DIRTY)))
 		rflags |= 1;
-
-	/* Always add C */
-	return rflags | HPTE_R_C;
+	/*
+	 * Always add "C" bit for perf. Memory coherence is always enabled
+	 */
+	return rflags | HPTE_R_C | HPTE_R_M;
 }
 
 int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 34de9e0cdc34..826893fcb3a7 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -127,7 +127,11 @@ repeat:
 
 		/* Add in WIMG bits */
 		rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
-				      _PAGE_COHERENT | _PAGE_GUARDED));
+				      _PAGE_GUARDED));
+		/*
+		 * enable the memory coherence always
+		 */
+		rflags |= HPTE_R_M;
 
 		/* Insert into the hash table, primary slot */
 		slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 0b7fb6761015..a5bcf9301196 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -99,6 +99,10 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		/* Add in WIMG bits */
 		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
 				      _PAGE_COHERENT | _PAGE_GUARDED));
+		/*
+		 * enable the memory coherence always
+		 */
+		rflags |= HPTE_R_M;
 
 		slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0,
 					     mmu_psize, ssize);
-- 
cgit v1.2.3


From c34a51ce49b40b9667cd7f5cc2e40475af8b4c3d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 18 Nov 2013 14:58:13 +0530
Subject: powerpc/mm: Enable _PAGE_NUMA for book3s

We steal the _PAGE_COHERENCE bit and use that for indicating NUMA ptes.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pgtable.h     | 66 +++++++++++++++++++++++++++++++++-
 arch/powerpc/include/asm/pte-hash64.h  |  6 ++++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 3 files changed, 72 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 7d6eacf249cf..b999ca318985 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -3,6 +3,7 @@
 #ifdef __KERNEL__
 
 #ifndef __ASSEMBLY__
+#include <linux/mmdebug.h>
 #include <asm/processor.h>		/* For TASK_SIZE */
 #include <asm/mmu.h>
 #include <asm/page.h>
@@ -33,10 +34,73 @@ static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_file(pte_t pte)		{ return pte_val(pte) & _PAGE_FILE; }
 static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
-static inline int pte_present(pte_t pte)	{ return pte_val(pte) & _PAGE_PRESENT; }
 static inline int pte_none(pte_t pte)		{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
 static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
 
+#ifdef CONFIG_NUMA_BALANCING
+
+static inline int pte_present(pte_t pte)
+{
+	return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+}
+
+#define pte_numa pte_numa
+static inline int pte_numa(pte_t pte)
+{
+	return (pte_val(pte) &
+		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+
+#define pte_mknonnuma pte_mknonnuma
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+	pte_val(pte) &= ~_PAGE_NUMA;
+	pte_val(pte) |=  _PAGE_PRESENT | _PAGE_ACCESSED;
+	return pte;
+}
+
+#define pte_mknuma pte_mknuma
+static inline pte_t pte_mknuma(pte_t pte)
+{
+	/*
+	 * We should not set _PAGE_NUMA on non present ptes. Also clear the
+	 * present bit so that hash_page will return 1 and we collect this
+	 * as numa fault.
+	 */
+	if (pte_present(pte)) {
+		pte_val(pte) |= _PAGE_NUMA;
+		pte_val(pte) &= ~_PAGE_PRESENT;
+	} else
+		VM_BUG_ON(1);
+	return pte;
+}
+
+#define pmd_numa pmd_numa
+static inline int pmd_numa(pmd_t pmd)
+{
+	return pte_numa(pmd_pte(pmd));
+}
+
+#define pmd_mknonnuma pmd_mknonnuma
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+	return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
+}
+
+#define pmd_mknuma pmd_mknuma
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+	return pte_pmd(pte_mknuma(pmd_pte(pmd)));
+}
+
+# else
+
+static inline int pte_present(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_PRESENT;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /* Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
  *
diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h
index 55aea0caf95e..2505d8eab15c 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -27,6 +27,12 @@
 #define _PAGE_RW		0x0200 /* software: user write access allowed */
 #define _PAGE_BUSY		0x0800 /* software: PTE & hash are busy */
 
+/*
+ * Used for tracking numa faults
+ */
+#define _PAGE_NUMA	0x00000010 /* Gather numa placement stats */
+
+
 /* No separate kernel read-only */
 #define _PAGE_KERNEL_RW		(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
 #define _PAGE_KERNEL_RO		 _PAGE_KERNEL_RW
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index bca2465a9c34..434fda39bf8b 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -72,6 +72,7 @@ config PPC_BOOK3S_64
 	select PPC_HAVE_PMU_SUPPORT
 	select SYS_SUPPORTS_HUGETLBFS
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
+	select ARCH_SUPPORTS_NUMA_BALANCING
 
 config PPC_BOOK3E_64
 	bool "Embedded processors"
-- 
cgit v1.2.3


From 75eb3d9b60c280a275099fbed06a890cee2d784b Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Fri, 15 Nov 2013 09:50:57 +0530
Subject: powerpc/powernv: Get FSP memory errors and plumb into memory poison
 infrastructure.

Get the memory errors reported by opal and plumb it into memory poison
infrastructure. This patch uses new messaging channel infrastructure to
pull the fsp memory errors to linux.

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/opal.h                    |  52 ++++++++
 arch/powerpc/platforms/powernv/Makefile            |   1 +
 .../powerpc/platforms/powernv/opal-memory-errors.c | 146 +++++++++++++++++++++
 3 files changed, 199 insertions(+)
 create mode 100644 arch/powerpc/platforms/powernv/opal-memory-errors.c

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 0a2ac85998d7..aded1b81bfd6 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -443,6 +443,58 @@ struct opal_machine_check_event {
 	} u;
 };
 
+/* FSP memory errors handling */
+enum OpalMemErr_Version {
+	OpalMemErr_V1 = 1,
+};
+
+enum OpalMemErrType {
+	OPAL_MEM_ERR_TYPE_RESILIENCE	= 0,
+	OPAL_MEM_ERR_TYPE_DYN_DALLOC,
+	OPAL_MEM_ERR_TYPE_SCRUB,
+};
+
+/* Memory Reilience error type */
+enum OpalMemErr_ResilErrType {
+	OPAL_MEM_RESILIENCE_CE		= 0,
+	OPAL_MEM_RESILIENCE_UE,
+	OPAL_MEM_RESILIENCE_UE_SCRUB,
+};
+
+/* Dynamic Memory Deallocation type */
+enum OpalMemErr_DynErrType {
+	OPAL_MEM_DYNAMIC_DEALLOC	= 0,
+};
+
+/* OpalMemoryErrorData->flags */
+#define OPAL_MEM_CORRECTED_ERROR	0x0001
+#define OPAL_MEM_THRESHOLD_EXCEEDED	0x0002
+#define OPAL_MEM_ACK_REQUIRED		0x8000
+
+struct OpalMemoryErrorData {
+	enum OpalMemErr_Version	version:8;	/* 0x00 */
+	enum OpalMemErrType	type:8;		/* 0x01 */
+	uint16_t		flags;		/* 0x02 */
+	uint8_t			reserved_1[4];	/* 0x04 */
+
+	union {
+		/* Memory Resilience corrected/uncorrected error info */
+		struct {
+			enum OpalMemErr_ResilErrType resil_err_type:8;
+			uint8_t		reserved_1[7];
+			uint64_t	physical_address_start;
+			uint64_t	physical_address_end;
+		} resilience;
+		/* Dynamic memory deallocation error info */
+		struct {
+			enum OpalMemErr_DynErrType dyn_err_type:8;
+			uint8_t		reserved_1[7];
+			uint64_t	physical_address_start;
+			uint64_t	physical_address_end;
+		} dyn_dealloc;
+	} u;
+};
+
 enum {
 	OPAL_P7IOC_DIAG_TYPE_NONE	= 0,
 	OPAL_P7IOC_DIAG_TYPE_RGC	= 1,
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index 873fa1370dc4..8d767fde5a6a 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o
 obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
+obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
new file mode 100644
index 000000000000..ec4132239cdf
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -0,0 +1,146 @@
+/*
+ * OPAL asynchronus Memory error handling support in PowreNV.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2013 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include <asm/opal.h>
+#include <asm/cputable.h>
+
+static int opal_mem_err_nb_init;
+static LIST_HEAD(opal_memory_err_list);
+static DEFINE_SPINLOCK(opal_mem_err_lock);
+
+struct OpalMsgNode {
+	struct list_head list;
+	struct opal_msg msg;
+};
+
+static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt)
+{
+	uint64_t paddr_start, paddr_end;
+
+	pr_debug("%s: Retrived memory error event, type: 0x%x\n",
+		  __func__, merr_evt->type);
+	switch (merr_evt->type) {
+	case OPAL_MEM_ERR_TYPE_RESILIENCE:
+		paddr_start = merr_evt->u.resilience.physical_address_start;
+		paddr_end = merr_evt->u.resilience.physical_address_end;
+		break;
+	case OPAL_MEM_ERR_TYPE_DYN_DALLOC:
+		paddr_start = merr_evt->u.dyn_dealloc.physical_address_start;
+		paddr_end = merr_evt->u.dyn_dealloc.physical_address_end;
+		break;
+	default:
+		return;
+	}
+
+	for (; paddr_start < paddr_end; paddr_start += PAGE_SIZE) {
+		memory_failure(paddr_start >> PAGE_SHIFT, 0, 0);
+	}
+}
+
+static void handle_memory_error(void)
+{
+	unsigned long flags;
+	struct OpalMemoryErrorData *merr_evt;
+	struct OpalMsgNode *msg_node;
+
+	spin_lock_irqsave(&opal_mem_err_lock, flags);
+	while (!list_empty(&opal_memory_err_list)) {
+		 msg_node = list_entry(opal_memory_err_list.next,
+					   struct OpalMsgNode, list);
+		list_del(&msg_node->list);
+		spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+		merr_evt = (struct OpalMemoryErrorData *)
+					&msg_node->msg.params[0];
+		handle_memory_error_event(merr_evt);
+		kfree(msg_node);
+		spin_lock_irqsave(&opal_mem_err_lock, flags);
+	}
+	spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+}
+
+static void mem_error_handler(struct work_struct *work)
+{
+	handle_memory_error();
+}
+
+static DECLARE_WORK(mem_error_work, mem_error_handler);
+
+/*
+ * opal_memory_err_event - notifier handler that queues up the opal message
+ * to be preocessed later.
+ */
+static int opal_memory_err_event(struct notifier_block *nb,
+			  unsigned long msg_type, void *msg)
+{
+	unsigned long flags;
+	struct OpalMsgNode *msg_node;
+
+	if (msg_type != OPAL_MSG_MEM_ERR)
+		return 0;
+
+	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
+	if (!msg_node) {
+		pr_err("MEMORY_ERROR: out of memory, Opal message event not"
+		       "handled\n");
+		return -ENOMEM;
+	}
+	memcpy(&msg_node->msg, msg, sizeof(struct opal_msg));
+
+	spin_lock_irqsave(&opal_mem_err_lock, flags);
+	list_add(&msg_node->list, &opal_memory_err_list);
+	spin_unlock_irqrestore(&opal_mem_err_lock, flags);
+
+	schedule_work(&mem_error_work);
+	return 0;
+}
+
+static struct notifier_block opal_mem_err_nb = {
+	.notifier_call	= opal_memory_err_event,
+	.next		= NULL,
+	.priority	= 0,
+};
+
+static int __init opal_mem_err_init(void)
+{
+	int ret;
+
+	if (!opal_mem_err_nb_init) {
+		ret = opal_message_notifier_register(
+					OPAL_MSG_MEM_ERR, &opal_mem_err_nb);
+		if (ret) {
+			pr_err("%s: Can't register OPAL event notifier (%d)\n",
+			       __func__, ret);
+			return ret;
+		}
+		opal_mem_err_nb_init = 1;
+	}
+	return 0;
+}
+subsys_initcall(opal_mem_err_init);
-- 
cgit v1.2.3


From e3fec2f74f7f90d2149a24243a4d040caabe6f30 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 17 Dec 2013 21:26:19 -0500
Subject: lib: Add missing arch generic-y entries for asm-generic/hash.h

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/alpha/include/asm/Kbuild      | 1 +
 arch/arc/include/asm/Kbuild        | 1 +
 arch/arm/include/asm/Kbuild        | 1 +
 arch/arm64/include/asm/Kbuild      | 1 +
 arch/avr32/include/asm/Kbuild      | 1 +
 arch/blackfin/include/asm/Kbuild   | 1 +
 arch/c6x/include/asm/Kbuild        | 1 +
 arch/cris/include/asm/Kbuild       | 1 +
 arch/frv/include/asm/Kbuild        | 1 +
 arch/hexagon/include/asm/Kbuild    | 1 +
 arch/ia64/include/asm/Kbuild       | 3 ++-
 arch/m32r/include/asm/Kbuild       | 1 +
 arch/m68k/include/asm/Kbuild       | 1 +
 arch/metag/include/asm/Kbuild      | 1 +
 arch/microblaze/include/asm/Kbuild | 1 +
 arch/mips/include/asm/Kbuild       | 1 +
 arch/mn10300/include/asm/Kbuild    | 1 +
 arch/openrisc/include/asm/Kbuild   | 1 +
 arch/parisc/include/asm/Kbuild     | 1 +
 arch/powerpc/include/asm/Kbuild    | 3 ++-
 arch/s390/include/asm/Kbuild       | 1 +
 arch/score/include/asm/Kbuild      | 2 ++
 arch/sh/include/asm/Kbuild         | 1 +
 arch/sparc/include/asm/Kbuild      | 1 +
 arch/tile/include/asm/Kbuild       | 1 +
 arch/um/include/asm/Kbuild         | 1 +
 arch/unicore32/include/asm/Kbuild  | 1 +
 arch/xtensa/include/asm/Kbuild     | 1 +
 28 files changed, 31 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index f01fb505ad52..a73a8e208a4a 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += clkdev.h
 generic-y += exec.h
 generic-y += trace_clock.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild
index 5943f7f9d325..93e6ca919620 100644
--- a/arch/arc/include/asm/Kbuild
+++ b/arch/arc/include/asm/Kbuild
@@ -47,3 +47,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
index c38b58c80202..3278afe2c3ab 100644
--- a/arch/arm/include/asm/Kbuild
+++ b/arch/arm/include/asm/Kbuild
@@ -34,3 +34,4 @@ generic-y += timex.h
 generic-y += trace_clock.h
 generic-y += unaligned.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild
index 519f89f5b6a3..626d4a92521f 100644
--- a/arch/arm64/include/asm/Kbuild
+++ b/arch/arm64/include/asm/Kbuild
@@ -51,3 +51,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/avr32/include/asm/Kbuild b/arch/avr32/include/asm/Kbuild
index 658001b52400..cfb9fe1b8df9 100644
--- a/arch/avr32/include/asm/Kbuild
+++ b/arch/avr32/include/asm/Kbuild
@@ -18,3 +18,4 @@ generic-y       += sections.h
 generic-y       += topology.h
 generic-y	+= trace_clock.h
 generic-y       += xor.h
+generic-y	+= hash.h
diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild
index f2b43474b0e2..359d36fdc247 100644
--- a/arch/blackfin/include/asm/Kbuild
+++ b/arch/blackfin/include/asm/Kbuild
@@ -45,3 +45,4 @@ generic-y += unaligned.h
 generic-y += user.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild
index fc0b3c356027..d73bb85ccdd3 100644
--- a/arch/c6x/include/asm/Kbuild
+++ b/arch/c6x/include/asm/Kbuild
@@ -57,3 +57,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild
index b06caf649a95..c5963b3e4624 100644
--- a/arch/cris/include/asm/Kbuild
+++ b/arch/cris/include/asm/Kbuild
@@ -12,3 +12,4 @@ generic-y += trace_clock.h
 generic-y += vga.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild
index 74742dc6a3da..bc42f14c9c2e 100644
--- a/arch/frv/include/asm/Kbuild
+++ b/arch/frv/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
 generic-y += exec.h
 generic-y += trace_clock.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild
index 67c3450309b7..469d223950ff 100644
--- a/arch/hexagon/include/asm/Kbuild
+++ b/arch/hexagon/include/asm/Kbuild
@@ -54,3 +54,4 @@ generic-y += ucontext.h
 generic-y += unaligned.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild
index f93ee087e8fe..283a83154b5e 100644
--- a/arch/ia64/include/asm/Kbuild
+++ b/arch/ia64/include/asm/Kbuild
@@ -4,4 +4,5 @@ generic-y += exec.h
 generic-y += kvm_para.h
 generic-y += trace_clock.h
 generic-y += preempt.h
-generic-y += vtime.h
\ No newline at end of file
+generic-y += vtime.h
+generic-y += hash.h
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild
index 2b58c5f0bc38..932435ac4e5c 100644
--- a/arch/m32r/include/asm/Kbuild
+++ b/arch/m32r/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += exec.h
 generic-y += module.h
 generic-y += trace_clock.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild
index a5d27f272a59..7cc8c364924d 100644
--- a/arch/m68k/include/asm/Kbuild
+++ b/arch/m68k/include/asm/Kbuild
@@ -32,3 +32,4 @@ generic-y += types.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild
index 84d0c1d6b9b3..b716d807c2ec 100644
--- a/arch/metag/include/asm/Kbuild
+++ b/arch/metag/include/asm/Kbuild
@@ -53,3 +53,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild
index ce0bbf8f5640..43eec338ff50 100644
--- a/arch/microblaze/include/asm/Kbuild
+++ b/arch/microblaze/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += exec.h
 generic-y += trace_clock.h
 generic-y += syscalls.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild
index 1acbb8b77a71..2d7f65052c1f 100644
--- a/arch/mips/include/asm/Kbuild
+++ b/arch/mips/include/asm/Kbuild
@@ -14,3 +14,4 @@ generic-y += trace_clock.h
 generic-y += preempt.h
 generic-y += ucontext.h
 generic-y += xor.h
+generic-y += hash.h
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild
index 74742dc6a3da..bc42f14c9c2e 100644
--- a/arch/mn10300/include/asm/Kbuild
+++ b/arch/mn10300/include/asm/Kbuild
@@ -3,3 +3,4 @@ generic-y += clkdev.h
 generic-y += exec.h
 generic-y += trace_clock.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild
index da1951a22907..2e40f1ca8667 100644
--- a/arch/openrisc/include/asm/Kbuild
+++ b/arch/openrisc/include/asm/Kbuild
@@ -69,3 +69,4 @@ generic-y += vga.h
 generic-y += word-at-a-time.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild
index a603b9ebe54c..75edd5fcc6ff 100644
--- a/arch/parisc/include/asm/Kbuild
+++ b/arch/parisc/include/asm/Kbuild
@@ -5,3 +5,4 @@ generic-y += word-at-a-time.h auxvec.h user.h cputime.h emergency-restart.h \
 	  poll.h xor.h clkdev.h exec.h
 generic-y += trace_clock.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index d8f9d2f18a23..6c0a955a1b06 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -3,4 +3,5 @@ generic-y += clkdev.h
 generic-y += rwsem.h
 generic-y += trace_clock.h
 generic-y += preempt.h
-generic-y += vtime.h
\ No newline at end of file
+generic-y += vtime.h
+generic-y += hash.h
diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild
index 7a5288f3479a..8386a4a1f19a 100644
--- a/arch/s390/include/asm/Kbuild
+++ b/arch/s390/include/asm/Kbuild
@@ -3,3 +3,4 @@
 generic-y += clkdev.h
 generic-y += trace_clock.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild
index f3414ade77a3..099e7ba40599 100644
--- a/arch/score/include/asm/Kbuild
+++ b/arch/score/include/asm/Kbuild
@@ -5,3 +5,5 @@ generic-y += clkdev.h
 generic-y += trace_clock.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
+
diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild
index 231efbb68108..0cd7198a4524 100644
--- a/arch/sh/include/asm/Kbuild
+++ b/arch/sh/include/asm/Kbuild
@@ -35,3 +35,4 @@ generic-y += trace_clock.h
 generic-y += ucontext.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild
index bf390667657a..4b60a0c325ec 100644
--- a/arch/sparc/include/asm/Kbuild
+++ b/arch/sparc/include/asm/Kbuild
@@ -17,3 +17,4 @@ generic-y += trace_clock.h
 generic-y += types.h
 generic-y += word-at-a-time.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild
index 22f3bd147fa7..3793c75e45d9 100644
--- a/arch/tile/include/asm/Kbuild
+++ b/arch/tile/include/asm/Kbuild
@@ -39,3 +39,4 @@ generic-y += trace_clock.h
 generic-y += types.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index fdde187e6087..75de4abe4f94 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -4,3 +4,4 @@ generic-y += ftrace.h pci.h io.h param.h delay.h mutex.h current.h exec.h
 generic-y += switch_to.h clkdev.h
 generic-y += trace_clock.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
index 00045cbe5c63..3ef4f9d9bf5d 100644
--- a/arch/unicore32/include/asm/Kbuild
+++ b/arch/unicore32/include/asm/Kbuild
@@ -61,3 +61,4 @@ generic-y += user.h
 generic-y += vga.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild
index 228d6aee3a16..d7efa1502101 100644
--- a/arch/xtensa/include/asm/Kbuild
+++ b/arch/xtensa/include/asm/Kbuild
@@ -29,3 +29,4 @@ generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += xor.h
 generic-y += preempt.h
+generic-y += hash.h
-- 
cgit v1.2.3


From e589a4404fa06730355de204d3d136ed9bbc7dea Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 9 Dec 2013 18:17:01 +1100
Subject: powerpc/iommu: Update constant names to reflect their hardcoded page
 size

The powerpc iommu uses a hardcoded page size of 4K. This patch changes
the name of the IOMMU_PAGE_* macros to reflect the hardcoded values. A
future patch will use the existing names to support dynamic page
sizes.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/iommu.h       | 10 ++---
 arch/powerpc/kernel/dma-iommu.c        |  4 +-
 arch/powerpc/kernel/iommu.c            | 78 +++++++++++++++++-----------------
 arch/powerpc/kernel/vio.c              | 19 +++++----
 arch/powerpc/platforms/cell/iommu.c    | 12 +++---
 arch/powerpc/platforms/powernv/pci.c   |  4 +-
 arch/powerpc/platforms/pseries/iommu.c |  8 ++--
 arch/powerpc/platforms/pseries/setup.c |  4 +-
 arch/powerpc/platforms/wsp/wsp_pci.c   | 10 ++---
 drivers/net/ethernet/ibm/ibmveth.c     |  9 ++--
 drivers/vfio/vfio_iommu_spapr_tce.c    | 28 ++++++------
 11 files changed, 94 insertions(+), 92 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 774fa2776907..0869c7e74421 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -30,10 +30,10 @@
 #include <asm/machdep.h>
 #include <asm/types.h>
 
-#define IOMMU_PAGE_SHIFT      12
-#define IOMMU_PAGE_SIZE       (ASM_CONST(1) << IOMMU_PAGE_SHIFT)
-#define IOMMU_PAGE_MASK       (~((1 << IOMMU_PAGE_SHIFT) - 1))
-#define IOMMU_PAGE_ALIGN(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE)
+#define IOMMU_PAGE_SHIFT_4K      12
+#define IOMMU_PAGE_SIZE_4K       (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K)
+#define IOMMU_PAGE_MASK_4K       (~((1 << IOMMU_PAGE_SHIFT_4K) - 1))
+#define IOMMU_PAGE_ALIGN_4K(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE_4K)
 
 /* Boot time flags */
 extern int iommu_is_off;
@@ -42,7 +42,7 @@ extern int iommu_force_on;
 /* Pure 2^n version of get_order */
 static __inline__ __attribute_const__ int get_iommu_order(unsigned long size)
 {
-	return __ilog2((size - 1) >> IOMMU_PAGE_SHIFT) + 1;
+	return __ilog2((size - 1) >> IOMMU_PAGE_SHIFT_4K) + 1;
 }
 
 
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index e4897523de41..5cfe3dbfc4a0 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -83,10 +83,10 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask)
 		return 0;
 	}
 
-	if (tbl->it_offset > (mask >> IOMMU_PAGE_SHIFT)) {
+	if (tbl->it_offset > (mask >> IOMMU_PAGE_SHIFT_4K)) {
 		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
 		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
-				mask, tbl->it_offset << IOMMU_PAGE_SHIFT);
+				mask, tbl->it_offset << IOMMU_PAGE_SHIFT_4K);
 		return 0;
 	} else
 		return 1;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index d22abe0b08e3..df4a7f1b7444 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -251,14 +251,14 @@ again:
 
 	if (dev)
 		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-				      1 << IOMMU_PAGE_SHIFT);
+				      1 << IOMMU_PAGE_SHIFT_4K);
 	else
-		boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT);
+		boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT_4K);
 	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
 
 	n = iommu_area_alloc(tbl->it_map, limit, start, npages,
-			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
-			     align_mask);
+			tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT_4K,
+			align_mask);
 	if (n == -1) {
 		if (likely(pass == 0)) {
 			/* First try the pool from the start */
@@ -320,12 +320,12 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 		return DMA_ERROR_CODE;
 
 	entry += tbl->it_offset;	/* Offset into real TCE table */
-	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
+	ret = entry << IOMMU_PAGE_SHIFT_4K;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
 	build_fail = ppc_md.tce_build(tbl, entry, npages,
-	                              (unsigned long)page & IOMMU_PAGE_MASK,
-	                              direction, attrs);
+				(unsigned long)page & IOMMU_PAGE_MASK_4K,
+				direction, attrs);
 
 	/* ppc_md.tce_build() only returns non-zero for transient errors.
 	 * Clean up the table bitmap in this case and return
@@ -352,7 +352,7 @@ static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
 {
 	unsigned long entry, free_entry;
 
-	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	entry = dma_addr >> IOMMU_PAGE_SHIFT_4K;
 	free_entry = entry - tbl->it_offset;
 
 	if (((free_entry + npages) > tbl->it_size) ||
@@ -401,7 +401,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	unsigned long flags;
 	struct iommu_pool *pool;
 
-	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	entry = dma_addr >> IOMMU_PAGE_SHIFT_4K;
 	free_entry = entry - tbl->it_offset;
 
 	pool = get_pool(tbl, free_entry);
@@ -468,13 +468,13 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		}
 		/* Allocate iommu entries for that segment */
 		vaddr = (unsigned long) sg_virt(s);
-		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE);
+		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE_4K);
 		align = 0;
-		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && slen >= PAGE_SIZE &&
+		if (IOMMU_PAGE_SHIFT_4K < PAGE_SHIFT && slen >= PAGE_SIZE &&
 		    (vaddr & ~PAGE_MASK) == 0)
-			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
+			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT_4K;
 		entry = iommu_range_alloc(dev, tbl, npages, &handle,
-					  mask >> IOMMU_PAGE_SHIFT, align);
+					  mask >> IOMMU_PAGE_SHIFT_4K, align);
 
 		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
 
@@ -489,16 +489,16 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 
 		/* Convert entry to a dma_addr_t */
 		entry += tbl->it_offset;
-		dma_addr = entry << IOMMU_PAGE_SHIFT;
-		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK);
+		dma_addr = entry << IOMMU_PAGE_SHIFT_4K;
+		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK_4K);
 
 		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
 		build_fail = ppc_md.tce_build(tbl, entry, npages,
-		                              vaddr & IOMMU_PAGE_MASK,
-		                              direction, attrs);
+					vaddr & IOMMU_PAGE_MASK_4K,
+					direction, attrs);
 		if(unlikely(build_fail))
 			goto failure;
 
@@ -559,9 +559,9 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		if (s->dma_length != 0) {
 			unsigned long vaddr, npages;
 
-			vaddr = s->dma_address & IOMMU_PAGE_MASK;
+			vaddr = s->dma_address & IOMMU_PAGE_MASK_4K;
 			npages = iommu_num_pages(s->dma_address, s->dma_length,
-						 IOMMU_PAGE_SIZE);
+						 IOMMU_PAGE_SIZE_4K);
 			__iommu_free(tbl, vaddr, npages);
 			s->dma_address = DMA_ERROR_CODE;
 			s->dma_length = 0;
@@ -592,7 +592,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 		if (sg->dma_length == 0)
 			break;
 		npages = iommu_num_pages(dma_handle, sg->dma_length,
-					 IOMMU_PAGE_SIZE);
+					 IOMMU_PAGE_SIZE_4K);
 		__iommu_free(tbl, dma_handle, npages);
 		sg = sg_next(sg);
 	}
@@ -676,7 +676,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 		set_bit(0, tbl->it_map);
 
 	/* We only split the IOMMU table if we have 1GB or more of space */
-	if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024))
+	if ((tbl->it_size << IOMMU_PAGE_SHIFT_4K) >= (1UL * 1024 * 1024 * 1024))
 		tbl->nr_pools = IOMMU_NR_POOLS;
 	else
 		tbl->nr_pools = 1;
@@ -768,16 +768,16 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
 
 	vaddr = page_address(page) + offset;
 	uaddr = (unsigned long)vaddr;
-	npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE);
+	npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE_4K);
 
 	if (tbl) {
 		align = 0;
-		if (IOMMU_PAGE_SHIFT < PAGE_SHIFT && size >= PAGE_SIZE &&
+		if (IOMMU_PAGE_SHIFT_4K < PAGE_SHIFT && size >= PAGE_SIZE &&
 		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
-			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT;
+			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT_4K;
 
 		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
-					 mask >> IOMMU_PAGE_SHIFT, align,
+					 mask >> IOMMU_PAGE_SHIFT_4K, align,
 					 attrs);
 		if (dma_handle == DMA_ERROR_CODE) {
 			if (printk_ratelimit())  {
@@ -786,7 +786,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
 					 npages);
 			}
 		} else
-			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK);
+			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK_4K);
 	}
 
 	return dma_handle;
@@ -801,7 +801,7 @@ void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
 	BUG_ON(direction == DMA_NONE);
 
 	if (tbl) {
-		npages = iommu_num_pages(dma_handle, size, IOMMU_PAGE_SIZE);
+		npages = iommu_num_pages(dma_handle, size, IOMMU_PAGE_SIZE_4K);
 		iommu_free(tbl, dma_handle, npages);
 	}
 }
@@ -845,10 +845,10 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
 	memset(ret, 0, size);
 
 	/* Set up tces to cover the allocated range */
-	nio_pages = size >> IOMMU_PAGE_SHIFT;
+	nio_pages = size >> IOMMU_PAGE_SHIFT_4K;
 	io_order = get_iommu_order(size);
 	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
-			      mask >> IOMMU_PAGE_SHIFT, io_order, NULL);
+			      mask >> IOMMU_PAGE_SHIFT_4K, io_order, NULL);
 	if (mapping == DMA_ERROR_CODE) {
 		free_pages((unsigned long)ret, order);
 		return NULL;
@@ -864,7 +864,7 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		unsigned int nio_pages;
 
 		size = PAGE_ALIGN(size);
-		nio_pages = size >> IOMMU_PAGE_SHIFT;
+		nio_pages = size >> IOMMU_PAGE_SHIFT_4K;
 		iommu_free(tbl, dma_handle, nio_pages);
 		size = PAGE_ALIGN(size);
 		free_pages((unsigned long)vaddr, get_order(size));
@@ -935,10 +935,10 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl,
 	if (tce_value)
 		return -EINVAL;
 
-	if (ioba & ~IOMMU_PAGE_MASK)
+	if (ioba & ~IOMMU_PAGE_MASK_4K)
 		return -EINVAL;
 
-	ioba >>= IOMMU_PAGE_SHIFT;
+	ioba >>= IOMMU_PAGE_SHIFT_4K;
 	if (ioba < tbl->it_offset)
 		return -EINVAL;
 
@@ -955,13 +955,13 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
 	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
 		return -EINVAL;
 
-	if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
+	if (tce & ~(IOMMU_PAGE_MASK_4K | TCE_PCI_WRITE | TCE_PCI_READ))
 		return -EINVAL;
 
-	if (ioba & ~IOMMU_PAGE_MASK)
+	if (ioba & ~IOMMU_PAGE_MASK_4K)
 		return -EINVAL;
 
-	ioba >>= IOMMU_PAGE_SHIFT;
+	ioba >>= IOMMU_PAGE_SHIFT_4K;
 	if (ioba < tbl->it_offset)
 		return -EINVAL;
 
@@ -1037,7 +1037,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 
 	/* if (unlikely(ret))
 		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
-				__func__, hwaddr, entry << IOMMU_PAGE_SHIFT,
+				__func__, hwaddr, entry << IOMMU_PAGE_SHIFT_4K,
 				hwaddr, ret); */
 
 	return ret;
@@ -1049,14 +1049,14 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
 {
 	int ret;
 	struct page *page = NULL;
-	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK & ~PAGE_MASK;
+	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK_4K & ~PAGE_MASK;
 	enum dma_data_direction direction = iommu_tce_direction(tce);
 
 	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
 			direction != DMA_TO_DEVICE, &page);
 	if (unlikely(ret != 1)) {
 		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-				tce, entry << IOMMU_PAGE_SHIFT, ret); */
+				tce, entry << IOMMU_PAGE_SHIFT_4K, ret); */
 		return -EFAULT;
 	}
 	hwaddr = (unsigned long) page_address(page) + offset;
@@ -1067,7 +1067,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
 
 	if (ret < 0)
 		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-				__func__, entry << IOMMU_PAGE_SHIFT, tce, ret);
+			__func__, entry << IOMMU_PAGE_SHIFT_4K, tce, ret);
 
 	return ret;
 }
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 76a64821f4a2..2e89fa350763 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -520,14 +520,14 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
 	struct vio_dev *viodev = to_vio_dev(dev);
 	dma_addr_t ret = DMA_ERROR_CODE;
 
-	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) {
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE_4K))) {
 		atomic_inc(&viodev->cmo.allocs_failed);
 		return ret;
 	}
 
 	ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
 	if (unlikely(dma_mapping_error(dev, ret))) {
-		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE_4K));
 		atomic_inc(&viodev->cmo.allocs_failed);
 	}
 
@@ -543,7 +543,7 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
 
 	dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
 
-	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE));
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE_4K));
 }
 
 static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -556,7 +556,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 	size_t alloc_size = 0;
 
 	for (sgl = sglist; count < nelems; count++, sgl++)
-		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE);
+		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE_4K);
 
 	if (vio_cmo_alloc(viodev, alloc_size)) {
 		atomic_inc(&viodev->cmo.allocs_failed);
@@ -572,7 +572,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 	}
 
 	for (sgl = sglist, count = 0; count < ret; count++, sgl++)
-		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE_4K);
 	if (alloc_size)
 		vio_cmo_dealloc(viodev, alloc_size);
 
@@ -590,7 +590,7 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
 	int count = 0;
 
 	for (sgl = sglist; count < nelems; count++, sgl++)
-		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE);
+		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE_4K);
 
 	dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
 
@@ -736,7 +736,8 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev)
 			return -EINVAL;
 		}
 
-		viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev));
+		viodev->cmo.desired =
+			IOMMU_PAGE_ALIGN_4K(viodrv->get_desired_dma(viodev));
 		if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
 			viodev->cmo.desired = VIO_CMO_MIN_ENT;
 		size = VIO_CMO_MIN_ENT;
@@ -1176,9 +1177,9 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 			    &tbl->it_index, &offset, &size);
 
 	/* TCE table size - measured in tce entries */
-	tbl->it_size = size >> IOMMU_PAGE_SHIFT;
+	tbl->it_size = size >> IOMMU_PAGE_SHIFT_4K;
 	/* offset for VIO should always be 0 */
-	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
+	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT_4K;
 	tbl->it_busno = 0;
 	tbl->it_type = TCE_VB;
 	tbl->it_blocksize = 16;
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index b53560660b72..fc61b908eaf0 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -197,7 +197,7 @@ static int tce_build_cell(struct iommu_table *tbl, long index, long npages,
 
 	io_pte = (unsigned long *)tbl->it_base + (index - tbl->it_offset);
 
-	for (i = 0; i < npages; i++, uaddr += IOMMU_PAGE_SIZE)
+	for (i = 0; i < npages; i++, uaddr += IOMMU_PAGE_SIZE_4K)
 		io_pte[i] = base_pte | (__pa(uaddr) & CBE_IOPTE_RPN_Mask);
 
 	mb();
@@ -430,7 +430,7 @@ static void cell_iommu_setup_hardware(struct cbe_iommu *iommu,
 {
 	cell_iommu_setup_stab(iommu, base, size, 0, 0);
 	iommu->ptab = cell_iommu_alloc_ptab(iommu, base, size, 0, 0,
-					    IOMMU_PAGE_SHIFT);
+					    IOMMU_PAGE_SHIFT_4K);
 	cell_iommu_enable_hardware(iommu);
 }
 
@@ -487,8 +487,8 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_blocksize = 16;
 	window->table.it_base = (unsigned long)iommu->ptab;
 	window->table.it_index = iommu->nid;
-	window->table.it_offset = (offset >> IOMMU_PAGE_SHIFT) + pte_offset;
-	window->table.it_size = size >> IOMMU_PAGE_SHIFT;
+	window->table.it_offset = (offset >> IOMMU_PAGE_SHIFT_4K) + pte_offset;
+	window->table.it_size = size >> IOMMU_PAGE_SHIFT_4K;
 
 	iommu_init_table(&window->table, iommu->nid);
 
@@ -773,7 +773,7 @@ static void __init cell_iommu_init_one(struct device_node *np,
 
 	/* Setup the iommu_table */
 	cell_iommu_setup_window(iommu, np, base, size,
-				offset >> IOMMU_PAGE_SHIFT);
+				offset >> IOMMU_PAGE_SHIFT_4K);
 }
 
 static void __init cell_disable_iommus(void)
@@ -1122,7 +1122,7 @@ static int __init cell_iommu_fixed_mapping_init(void)
 
 		cell_iommu_setup_stab(iommu, dbase, dsize, fbase, fsize);
 		iommu->ptab = cell_iommu_alloc_ptab(iommu, dbase, dsize, 0, 0,
-						    IOMMU_PAGE_SHIFT);
+						    IOMMU_PAGE_SHIFT_4K);
 		cell_iommu_setup_fixed_ptab(iommu, np, dbase, dsize,
 					     fbase, fsize);
 		cell_iommu_enable_hardware(iommu);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index bac289aac7cc..7f4d857668a9 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -564,7 +564,7 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 {
 	tbl->it_blocksize = 16;
 	tbl->it_base = (unsigned long)tce_mem;
-	tbl->it_offset = dma_offset >> IOMMU_PAGE_SHIFT;
+	tbl->it_offset = dma_offset >> IOMMU_PAGE_SHIFT_4K;
 	tbl->it_index = 0;
 	tbl->it_size = tce_size >> 3;
 	tbl->it_busno = 0;
@@ -761,7 +761,7 @@ static struct notifier_block tce_iommu_bus_nb = {
 
 static int __init tce_iommu_bus_notifier_init(void)
 {
-	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE);
+	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE_4K);
 
 	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
 	return 0;
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index a80af6c20cba..1b7531ce0c0c 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -488,7 +488,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
 	tbl->it_busno = phb->bus->number;
 
 	/* Units of tce entries */
-	tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT;
+	tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT_4K;
 
 	/* Test if we are going over 2GB of DMA space */
 	if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
@@ -499,7 +499,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
 	phb->dma_window_base_cur += phb->dma_window_size;
 
 	/* Set the tce table size - measured in entries */
-	tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT;
+	tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT_4K;
 
 	tbl->it_index = 0;
 	tbl->it_blocksize = 16;
@@ -540,8 +540,8 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
 	tbl->it_base   = 0;
 	tbl->it_blocksize  = 16;
 	tbl->it_type = TCE_PCI;
-	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
-	tbl->it_size = size >> IOMMU_PAGE_SHIFT;
+	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT_4K;
+	tbl->it_size = size >> IOMMU_PAGE_SHIFT_4K;
 }
 
 static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index c1f190858701..49cd16e6b450 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -72,7 +72,7 @@
 
 int CMO_PrPSP = -1;
 int CMO_SecPSP = -1;
-unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT);
+unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT_4K);
 EXPORT_SYMBOL(CMO_PageSize);
 
 int fwnmi_active;  /* TRUE if an FWNMI handler is present */
@@ -569,7 +569,7 @@ void pSeries_cmo_feature_init(void)
 {
 	char *ptr, *key, *value, *end;
 	int call_status;
-	int page_order = IOMMU_PAGE_SHIFT;
+	int page_order = IOMMU_PAGE_SHIFT_4K;
 
 	pr_debug(" -> fw_cmo_feature_init()\n");
 	spin_lock(&rtas_data_buf_lock);
diff --git a/arch/powerpc/platforms/wsp/wsp_pci.c b/arch/powerpc/platforms/wsp/wsp_pci.c
index 62cb527493e7..8a589618551e 100644
--- a/arch/powerpc/platforms/wsp/wsp_pci.c
+++ b/arch/powerpc/platforms/wsp/wsp_pci.c
@@ -260,7 +260,7 @@ static int tce_build_wsp(struct iommu_table *tbl, long index, long npages,
 		*tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
 
 		dma_debug("[DMA] TCE %p set to 0x%016llx (dma addr: 0x%lx)\n",
-			  tcep, *tcep, (tbl->it_offset + index) << IOMMU_PAGE_SHIFT);
+			  tcep, *tcep, (tbl->it_offset + index) << IOMMU_PAGE_SHIFT_4K);
 
 		uaddr += TCE_PAGE_SIZE;
 		index++;
@@ -381,8 +381,8 @@ static struct wsp_dma_table *wsp_pci_create_dma32_table(struct wsp_phb *phb,
 
 	/* Init bits and pieces */
 	tbl->table.it_blocksize = 16;
-	tbl->table.it_offset = addr >> IOMMU_PAGE_SHIFT;
-	tbl->table.it_size = size >> IOMMU_PAGE_SHIFT;
+	tbl->table.it_offset = addr >> IOMMU_PAGE_SHIFT_4K;
+	tbl->table.it_size = size >> IOMMU_PAGE_SHIFT_4K;
 
 	/*
 	 * It's already blank but we clear it anyway.
@@ -449,8 +449,8 @@ static void wsp_pci_dma_dev_setup(struct pci_dev *pdev)
 	if (table) {
 		pr_info("%s: Setup iommu: 32-bit DMA region 0x%08lx..0x%08lx\n",
 			pci_name(pdev),
-			table->table.it_offset << IOMMU_PAGE_SHIFT,
-			(table->table.it_offset << IOMMU_PAGE_SHIFT)
+			table->table.it_offset << IOMMU_PAGE_SHIFT_4K,
+			(table->table.it_offset << IOMMU_PAGE_SHIFT_4K)
 			+ phb->dma32_region_size - 1);
 		archdata->dma_data.iommu_table_base = &table->table;
 		return;
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index 952d795230a4..f7d7538b6bd9 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -1282,24 +1282,25 @@ static unsigned long ibmveth_get_desired_dma(struct vio_dev *vdev)
 
 	/* netdev inits at probe time along with the structures we need below*/
 	if (netdev == NULL)
-		return IOMMU_PAGE_ALIGN(IBMVETH_IO_ENTITLEMENT_DEFAULT);
+		return IOMMU_PAGE_ALIGN_4K(IBMVETH_IO_ENTITLEMENT_DEFAULT);
 
 	adapter = netdev_priv(netdev);
 
 	ret = IBMVETH_BUFF_LIST_SIZE + IBMVETH_FILT_LIST_SIZE;
-	ret += IOMMU_PAGE_ALIGN(netdev->mtu);
+	ret += IOMMU_PAGE_ALIGN_4K(netdev->mtu);
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) {
 		/* add the size of the active receive buffers */
 		if (adapter->rx_buff_pool[i].active)
 			ret +=
 			    adapter->rx_buff_pool[i].size *
-			    IOMMU_PAGE_ALIGN(adapter->rx_buff_pool[i].
+			    IOMMU_PAGE_ALIGN_4K(adapter->rx_buff_pool[i].
 			            buff_size);
 		rxqentries += adapter->rx_buff_pool[i].size;
 	}
 	/* add the size of the receive queue entries */
-	ret += IOMMU_PAGE_ALIGN(rxqentries * sizeof(struct ibmveth_rx_q_entry));
+	ret += IOMMU_PAGE_ALIGN_4K(
+		rxqentries * sizeof(struct ibmveth_rx_q_entry));
 
 	return ret;
 }
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index bdae7a04af75..a84788ba662c 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -81,7 +81,7 @@ static int tce_iommu_enable(struct tce_container *container)
 	 * enforcing the limit based on the max that the guest can map.
 	 */
 	down_write(&current->mm->mmap_sem);
-	npages = (tbl->it_size << IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+	npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
 	locked = current->mm->locked_vm + npages;
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
@@ -110,7 +110,7 @@ static void tce_iommu_disable(struct tce_container *container)
 
 	down_write(&current->mm->mmap_sem);
 	current->mm->locked_vm -= (container->tbl->it_size <<
-			IOMMU_PAGE_SHIFT) >> PAGE_SHIFT;
+			IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
 	up_write(&current->mm->mmap_sem);
 }
 
@@ -174,8 +174,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (info.argsz < minsz)
 			return -EINVAL;
 
-		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT;
-		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT;
+		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
+		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
 		info.flags = 0;
 
 		if (copy_to_user((void __user *)arg, &info, minsz))
@@ -205,8 +205,8 @@ static long tce_iommu_ioctl(void *iommu_data,
 				VFIO_DMA_MAP_FLAG_WRITE))
 			return -EINVAL;
 
-		if ((param.size & ~IOMMU_PAGE_MASK) ||
-				(param.vaddr & ~IOMMU_PAGE_MASK))
+		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
+				(param.vaddr & ~IOMMU_PAGE_MASK_4K))
 			return -EINVAL;
 
 		/* iova is checked by the IOMMU API */
@@ -220,17 +220,17 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (ret)
 			return ret;
 
-		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT); ++i) {
+		for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
 			ret = iommu_put_tce_user_mode(tbl,
-					(param.iova >> IOMMU_PAGE_SHIFT) + i,
+					(param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
 					tce);
 			if (ret)
 				break;
-			tce += IOMMU_PAGE_SIZE;
+			tce += IOMMU_PAGE_SIZE_4K;
 		}
 		if (ret)
 			iommu_clear_tces_and_put_pages(tbl,
-					param.iova >> IOMMU_PAGE_SHIFT,	i);
+					param.iova >> IOMMU_PAGE_SHIFT_4K, i);
 
 		iommu_flush_tce(tbl);
 
@@ -256,17 +256,17 @@ static long tce_iommu_ioctl(void *iommu_data,
 		if (param.flags)
 			return -EINVAL;
 
-		if (param.size & ~IOMMU_PAGE_MASK)
+		if (param.size & ~IOMMU_PAGE_MASK_4K)
 			return -EINVAL;
 
 		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
-				param.size >> IOMMU_PAGE_SHIFT);
+				param.size >> IOMMU_PAGE_SHIFT_4K);
 		if (ret)
 			return ret;
 
 		ret = iommu_clear_tces_and_put_pages(tbl,
-				param.iova >> IOMMU_PAGE_SHIFT,
-				param.size >> IOMMU_PAGE_SHIFT);
+				param.iova >> IOMMU_PAGE_SHIFT_4K,
+				param.size >> IOMMU_PAGE_SHIFT_4K);
 		iommu_flush_tce(tbl);
 
 		return ret;
-- 
cgit v1.2.3


From 3a553170d35d69bea3877bffa508489dfa6f133d Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 9 Dec 2013 18:17:02 +1100
Subject: powerpc/iommu: Add it_page_shift field to determine iommu page size

This patch adds a it_page_shift field to struct iommu_table and
initiliases it to 4K for all platforms.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/iommu.h       |  1 +
 arch/powerpc/kernel/vio.c              |  5 +++--
 arch/powerpc/platforms/cell/iommu.c    |  8 +++++---
 arch/powerpc/platforms/pasemi/iommu.c  |  5 ++++-
 arch/powerpc/platforms/powernv/pci.c   |  3 ++-
 arch/powerpc/platforms/pseries/iommu.c | 10 ++++++----
 arch/powerpc/platforms/wsp/wsp_pci.c   |  5 +++--
 7 files changed, 24 insertions(+), 13 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 0869c7e74421..7c928342f3f5 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -76,6 +76,7 @@ struct iommu_table {
 	struct iommu_pool large_pool;
 	struct iommu_pool pools[IOMMU_NR_POOLS];
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
+	unsigned long  it_page_shift;/* table iommu page size */
 #ifdef CONFIG_IOMMU_API
 	struct iommu_group *it_group;
 #endif
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 2e89fa350763..170ac24a0106 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -1177,9 +1177,10 @@ static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 			    &tbl->it_index, &offset, &size);
 
 	/* TCE table size - measured in tce entries */
-	tbl->it_size = size >> IOMMU_PAGE_SHIFT_4K;
+	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+	tbl->it_size = size >> tbl->it_page_shift;
 	/* offset for VIO should always be 0 */
-	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT_4K;
+	tbl->it_offset = offset >> tbl->it_page_shift;
 	tbl->it_busno = 0;
 	tbl->it_type = TCE_VB;
 	tbl->it_blocksize = 16;
diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c
index fc61b908eaf0..2b90ff8a93be 100644
--- a/arch/powerpc/platforms/cell/iommu.c
+++ b/arch/powerpc/platforms/cell/iommu.c
@@ -197,7 +197,7 @@ static int tce_build_cell(struct iommu_table *tbl, long index, long npages,
 
 	io_pte = (unsigned long *)tbl->it_base + (index - tbl->it_offset);
 
-	for (i = 0; i < npages; i++, uaddr += IOMMU_PAGE_SIZE_4K)
+	for (i = 0; i < npages; i++, uaddr += tbl->it_page_shift)
 		io_pte[i] = base_pte | (__pa(uaddr) & CBE_IOPTE_RPN_Mask);
 
 	mb();
@@ -487,8 +487,10 @@ cell_iommu_setup_window(struct cbe_iommu *iommu, struct device_node *np,
 	window->table.it_blocksize = 16;
 	window->table.it_base = (unsigned long)iommu->ptab;
 	window->table.it_index = iommu->nid;
-	window->table.it_offset = (offset >> IOMMU_PAGE_SHIFT_4K) + pte_offset;
-	window->table.it_size = size >> IOMMU_PAGE_SHIFT_4K;
+	window->table.it_page_shift = IOMMU_PAGE_SHIFT_4K;
+	window->table.it_offset =
+		(offset >> window->table.it_page_shift) + pte_offset;
+	window->table.it_size = size >> window->table.it_page_shift;
 
 	iommu_init_table(&window->table, iommu->nid);
 
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 7d2d036754b5..2e576f2ae442 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -138,8 +138,11 @@ static void iommu_table_iobmap_setup(void)
 	pr_debug(" -> %s\n", __func__);
 	iommu_table_iobmap.it_busno = 0;
 	iommu_table_iobmap.it_offset = 0;
+	iommu_table_iobmap.it_page_shift = IOBMAP_PAGE_SHIFT;
+
 	/* it_size is in number of entries */
-	iommu_table_iobmap.it_size = 0x80000000 >> IOBMAP_PAGE_SHIFT;
+	iommu_table_iobmap.it_size =
+		0x80000000 >> iommu_table_iobmap.it_page_shift;
 
 	/* Initialize the common IOMMU code */
 	iommu_table_iobmap.it_base = (unsigned long)iob_l2_base;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 7f4d857668a9..569b46423a93 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -564,7 +564,8 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 {
 	tbl->it_blocksize = 16;
 	tbl->it_base = (unsigned long)tce_mem;
-	tbl->it_offset = dma_offset >> IOMMU_PAGE_SHIFT_4K;
+	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
+	tbl->it_offset = dma_offset >> tbl->it_page_shift;
 	tbl->it_index = 0;
 	tbl->it_size = tce_size >> 3;
 	tbl->it_busno = 0;
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 1b7531ce0c0c..e0299183ae54 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -486,9 +486,10 @@ static void iommu_table_setparms(struct pci_controller *phb,
 		memset((void *)tbl->it_base, 0, *sizep);
 
 	tbl->it_busno = phb->bus->number;
+	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
 
 	/* Units of tce entries */
-	tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT_4K;
+	tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
 
 	/* Test if we are going over 2GB of DMA space */
 	if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
@@ -499,7 +500,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
 	phb->dma_window_base_cur += phb->dma_window_size;
 
 	/* Set the tce table size - measured in entries */
-	tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT_4K;
+	tbl->it_size = phb->dma_window_size >> tbl->it_page_shift;
 
 	tbl->it_index = 0;
 	tbl->it_blocksize = 16;
@@ -537,11 +538,12 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
 	of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size);
 
 	tbl->it_busno = phb->bus->number;
+	tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
 	tbl->it_base   = 0;
 	tbl->it_blocksize  = 16;
 	tbl->it_type = TCE_PCI;
-	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT_4K;
-	tbl->it_size = size >> IOMMU_PAGE_SHIFT_4K;
+	tbl->it_offset = offset >> tbl->it_page_shift;
+	tbl->it_size = size >> tbl->it_page_shift;
 }
 
 static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
diff --git a/arch/powerpc/platforms/wsp/wsp_pci.c b/arch/powerpc/platforms/wsp/wsp_pci.c
index 8a589618551e..9a15e5b39bb8 100644
--- a/arch/powerpc/platforms/wsp/wsp_pci.c
+++ b/arch/powerpc/platforms/wsp/wsp_pci.c
@@ -381,8 +381,9 @@ static struct wsp_dma_table *wsp_pci_create_dma32_table(struct wsp_phb *phb,
 
 	/* Init bits and pieces */
 	tbl->table.it_blocksize = 16;
-	tbl->table.it_offset = addr >> IOMMU_PAGE_SHIFT_4K;
-	tbl->table.it_size = size >> IOMMU_PAGE_SHIFT_4K;
+	tbl->table.it_page_shift = IOMMU_PAGE_SHIFT_4K;
+	tbl->table.it_offset = addr >> tbl->table.it_page_shift;
+	tbl->table.it_size = size >> tbl->table.it_page_shift;
 
 	/*
 	 * It's already blank but we clear it anyway.
-- 
cgit v1.2.3


From d084775738b746648d4102337163a04534a02982 Mon Sep 17 00:00:00 2001
From: Alistair Popple <alistair@popple.id.au>
Date: Mon, 9 Dec 2013 18:17:03 +1100
Subject: powerpc/iommu: Update the generic code to use dynamic iommu page
 sizes

This patch updates the generic iommu backend code to use the
it_page_shift field to determine the iommu page size instead of
using hardcoded values.

Signed-off-by: Alistair Popple <alistair@popple.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/iommu.h     | 19 +++++---
 arch/powerpc/kernel/dma-iommu.c      |  4 +-
 arch/powerpc/kernel/iommu.c          | 88 +++++++++++++++++++-----------------
 arch/powerpc/kernel/vio.c            | 25 +++++++---
 arch/powerpc/platforms/powernv/pci.c |  2 -
 drivers/net/ethernet/ibm/ibmveth.c   | 15 +++---
 6 files changed, 88 insertions(+), 65 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 7c928342f3f5..f7a8036579b5 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -35,17 +35,14 @@
 #define IOMMU_PAGE_MASK_4K       (~((1 << IOMMU_PAGE_SHIFT_4K) - 1))
 #define IOMMU_PAGE_ALIGN_4K(addr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE_4K)
 
+#define IOMMU_PAGE_SIZE(tblptr) (ASM_CONST(1) << (tblptr)->it_page_shift)
+#define IOMMU_PAGE_MASK(tblptr) (~((1 << (tblptr)->it_page_shift) - 1))
+#define IOMMU_PAGE_ALIGN(addr, tblptr) _ALIGN_UP(addr, IOMMU_PAGE_SIZE(tblptr))
+
 /* Boot time flags */
 extern int iommu_is_off;
 extern int iommu_force_on;
 
-/* Pure 2^n version of get_order */
-static __inline__ __attribute_const__ int get_iommu_order(unsigned long size)
-{
-	return __ilog2((size - 1) >> IOMMU_PAGE_SHIFT_4K) + 1;
-}
-
-
 /*
  * IOMAP_MAX_ORDER defines the largest contiguous block
  * of dma space we can get.  IOMAP_MAX_ORDER = 13
@@ -82,6 +79,14 @@ struct iommu_table {
 #endif
 };
 
+/* Pure 2^n version of get_order */
+static inline __attribute_const__
+int get_iommu_order(unsigned long size, struct iommu_table *tbl)
+{
+	return __ilog2((size - 1) >> tbl->it_page_shift) + 1;
+}
+
+
 struct scatterlist;
 
 static inline void set_iommu_table_base(struct device *dev, void *base)
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index 5cfe3dbfc4a0..54d0116256f7 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -83,10 +83,10 @@ static int dma_iommu_dma_supported(struct device *dev, u64 mask)
 		return 0;
 	}
 
-	if (tbl->it_offset > (mask >> IOMMU_PAGE_SHIFT_4K)) {
+	if (tbl->it_offset > (mask >> tbl->it_page_shift)) {
 		dev_info(dev, "Warning: IOMMU offset too big for device mask\n");
 		dev_info(dev, "mask: 0x%08llx, table offset: 0x%08lx\n",
-				mask, tbl->it_offset << IOMMU_PAGE_SHIFT_4K);
+				mask, tbl->it_offset << tbl->it_page_shift);
 		return 0;
 	} else
 		return 1;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index df4a7f1b7444..f58d8135aab2 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -251,14 +251,13 @@ again:
 
 	if (dev)
 		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-				      1 << IOMMU_PAGE_SHIFT_4K);
+				      1 << tbl->it_page_shift);
 	else
-		boundary_size = ALIGN(1UL << 32, 1 << IOMMU_PAGE_SHIFT_4K);
+		boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
 	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
 
-	n = iommu_area_alloc(tbl->it_map, limit, start, npages,
-			tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT_4K,
-			align_mask);
+	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
+			     boundary_size >> tbl->it_page_shift, align_mask);
 	if (n == -1) {
 		if (likely(pass == 0)) {
 			/* First try the pool from the start */
@@ -320,12 +319,12 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 		return DMA_ERROR_CODE;
 
 	entry += tbl->it_offset;	/* Offset into real TCE table */
-	ret = entry << IOMMU_PAGE_SHIFT_4K;	/* Set the return dma address */
+	ret = entry << tbl->it_page_shift;	/* Set the return dma address */
 
 	/* Put the TCEs in the HW table */
 	build_fail = ppc_md.tce_build(tbl, entry, npages,
-				(unsigned long)page & IOMMU_PAGE_MASK_4K,
-				direction, attrs);
+				      (unsigned long)page &
+				      IOMMU_PAGE_MASK(tbl), direction, attrs);
 
 	/* ppc_md.tce_build() only returns non-zero for transient errors.
 	 * Clean up the table bitmap in this case and return
@@ -352,7 +351,7 @@ static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
 {
 	unsigned long entry, free_entry;
 
-	entry = dma_addr >> IOMMU_PAGE_SHIFT_4K;
+	entry = dma_addr >> tbl->it_page_shift;
 	free_entry = entry - tbl->it_offset;
 
 	if (((free_entry + npages) > tbl->it_size) ||
@@ -401,7 +400,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	unsigned long flags;
 	struct iommu_pool *pool;
 
-	entry = dma_addr >> IOMMU_PAGE_SHIFT_4K;
+	entry = dma_addr >> tbl->it_page_shift;
 	free_entry = entry - tbl->it_offset;
 
 	pool = get_pool(tbl, free_entry);
@@ -468,13 +467,13 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		}
 		/* Allocate iommu entries for that segment */
 		vaddr = (unsigned long) sg_virt(s);
-		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE_4K);
+		npages = iommu_num_pages(vaddr, slen, IOMMU_PAGE_SIZE(tbl));
 		align = 0;
-		if (IOMMU_PAGE_SHIFT_4K < PAGE_SHIFT && slen >= PAGE_SIZE &&
+		if (tbl->it_page_shift < PAGE_SHIFT && slen >= PAGE_SIZE &&
 		    (vaddr & ~PAGE_MASK) == 0)
-			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT_4K;
+			align = PAGE_SHIFT - tbl->it_page_shift;
 		entry = iommu_range_alloc(dev, tbl, npages, &handle,
-					  mask >> IOMMU_PAGE_SHIFT_4K, align);
+					  mask >> tbl->it_page_shift, align);
 
 		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
 
@@ -489,16 +488,16 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 
 		/* Convert entry to a dma_addr_t */
 		entry += tbl->it_offset;
-		dma_addr = entry << IOMMU_PAGE_SHIFT_4K;
-		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK_4K);
+		dma_addr = entry << tbl->it_page_shift;
+		dma_addr |= (s->offset & ~IOMMU_PAGE_MASK(tbl));
 
 		DBG("  - %lu pages, entry: %lx, dma_addr: %lx\n",
 			    npages, entry, dma_addr);
 
 		/* Insert into HW table */
 		build_fail = ppc_md.tce_build(tbl, entry, npages,
-					vaddr & IOMMU_PAGE_MASK_4K,
-					direction, attrs);
+					      vaddr & IOMMU_PAGE_MASK(tbl),
+					      direction, attrs);
 		if(unlikely(build_fail))
 			goto failure;
 
@@ -559,9 +558,9 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		if (s->dma_length != 0) {
 			unsigned long vaddr, npages;
 
-			vaddr = s->dma_address & IOMMU_PAGE_MASK_4K;
+			vaddr = s->dma_address & IOMMU_PAGE_MASK(tbl);
 			npages = iommu_num_pages(s->dma_address, s->dma_length,
-						 IOMMU_PAGE_SIZE_4K);
+						 IOMMU_PAGE_SIZE(tbl));
 			__iommu_free(tbl, vaddr, npages);
 			s->dma_address = DMA_ERROR_CODE;
 			s->dma_length = 0;
@@ -592,7 +591,7 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 		if (sg->dma_length == 0)
 			break;
 		npages = iommu_num_pages(dma_handle, sg->dma_length,
-					 IOMMU_PAGE_SIZE_4K);
+					 IOMMU_PAGE_SIZE(tbl));
 		__iommu_free(tbl, dma_handle, npages);
 		sg = sg_next(sg);
 	}
@@ -676,7 +675,7 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 		set_bit(0, tbl->it_map);
 
 	/* We only split the IOMMU table if we have 1GB or more of space */
-	if ((tbl->it_size << IOMMU_PAGE_SHIFT_4K) >= (1UL * 1024 * 1024 * 1024))
+	if ((tbl->it_size << tbl->it_page_shift) >= (1UL * 1024 * 1024 * 1024))
 		tbl->nr_pools = IOMMU_NR_POOLS;
 	else
 		tbl->nr_pools = 1;
@@ -768,16 +767,16 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
 
 	vaddr = page_address(page) + offset;
 	uaddr = (unsigned long)vaddr;
-	npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE_4K);
+	npages = iommu_num_pages(uaddr, size, IOMMU_PAGE_SIZE(tbl));
 
 	if (tbl) {
 		align = 0;
-		if (IOMMU_PAGE_SHIFT_4K < PAGE_SHIFT && size >= PAGE_SIZE &&
+		if (tbl->it_page_shift < PAGE_SHIFT && size >= PAGE_SIZE &&
 		    ((unsigned long)vaddr & ~PAGE_MASK) == 0)
-			align = PAGE_SHIFT - IOMMU_PAGE_SHIFT_4K;
+			align = PAGE_SHIFT - tbl->it_page_shift;
 
 		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
-					 mask >> IOMMU_PAGE_SHIFT_4K, align,
+					 mask >> tbl->it_page_shift, align,
 					 attrs);
 		if (dma_handle == DMA_ERROR_CODE) {
 			if (printk_ratelimit())  {
@@ -786,7 +785,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
 					 npages);
 			}
 		} else
-			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK_4K);
+			dma_handle |= (uaddr & ~IOMMU_PAGE_MASK(tbl));
 	}
 
 	return dma_handle;
@@ -801,7 +800,8 @@ void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle,
 	BUG_ON(direction == DMA_NONE);
 
 	if (tbl) {
-		npages = iommu_num_pages(dma_handle, size, IOMMU_PAGE_SIZE_4K);
+		npages = iommu_num_pages(dma_handle, size,
+					 IOMMU_PAGE_SIZE(tbl));
 		iommu_free(tbl, dma_handle, npages);
 	}
 }
@@ -845,10 +845,10 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
 	memset(ret, 0, size);
 
 	/* Set up tces to cover the allocated range */
-	nio_pages = size >> IOMMU_PAGE_SHIFT_4K;
-	io_order = get_iommu_order(size);
+	nio_pages = size >> tbl->it_page_shift;
+	io_order = get_iommu_order(size, tbl);
 	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
-			      mask >> IOMMU_PAGE_SHIFT_4K, io_order, NULL);
+			      mask >> tbl->it_page_shift, io_order, NULL);
 	if (mapping == DMA_ERROR_CODE) {
 		free_pages((unsigned long)ret, order);
 		return NULL;
@@ -864,7 +864,7 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		unsigned int nio_pages;
 
 		size = PAGE_ALIGN(size);
-		nio_pages = size >> IOMMU_PAGE_SHIFT_4K;
+		nio_pages = size >> tbl->it_page_shift;
 		iommu_free(tbl, dma_handle, nio_pages);
 		size = PAGE_ALIGN(size);
 		free_pages((unsigned long)vaddr, get_order(size));
@@ -935,10 +935,10 @@ int iommu_tce_clear_param_check(struct iommu_table *tbl,
 	if (tce_value)
 		return -EINVAL;
 
-	if (ioba & ~IOMMU_PAGE_MASK_4K)
+	if (ioba & ~IOMMU_PAGE_MASK(tbl))
 		return -EINVAL;
 
-	ioba >>= IOMMU_PAGE_SHIFT_4K;
+	ioba >>= tbl->it_page_shift;
 	if (ioba < tbl->it_offset)
 		return -EINVAL;
 
@@ -955,13 +955,13 @@ int iommu_tce_put_param_check(struct iommu_table *tbl,
 	if (!(tce & (TCE_PCI_WRITE | TCE_PCI_READ)))
 		return -EINVAL;
 
-	if (tce & ~(IOMMU_PAGE_MASK_4K | TCE_PCI_WRITE | TCE_PCI_READ))
+	if (tce & ~(IOMMU_PAGE_MASK(tbl) | TCE_PCI_WRITE | TCE_PCI_READ))
 		return -EINVAL;
 
-	if (ioba & ~IOMMU_PAGE_MASK_4K)
+	if (ioba & ~IOMMU_PAGE_MASK(tbl))
 		return -EINVAL;
 
-	ioba >>= IOMMU_PAGE_SHIFT_4K;
+	ioba >>= tbl->it_page_shift;
 	if (ioba < tbl->it_offset)
 		return -EINVAL;
 
@@ -1037,7 +1037,7 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
 
 	/* if (unlikely(ret))
 		pr_err("iommu_tce: %s failed on hwaddr=%lx ioba=%lx kva=%lx ret=%d\n",
-				__func__, hwaddr, entry << IOMMU_PAGE_SHIFT_4K,
+			__func__, hwaddr, entry << IOMMU_PAGE_SHIFT(tbl),
 				hwaddr, ret); */
 
 	return ret;
@@ -1049,14 +1049,14 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
 {
 	int ret;
 	struct page *page = NULL;
-	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK_4K & ~PAGE_MASK;
+	unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
 	enum dma_data_direction direction = iommu_tce_direction(tce);
 
 	ret = get_user_pages_fast(tce & PAGE_MASK, 1,
 			direction != DMA_TO_DEVICE, &page);
 	if (unlikely(ret != 1)) {
 		/* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
-				tce, entry << IOMMU_PAGE_SHIFT_4K, ret); */
+				tce, entry << IOMMU_PAGE_SHIFT(tbl), ret); */
 		return -EFAULT;
 	}
 	hwaddr = (unsigned long) page_address(page) + offset;
@@ -1067,7 +1067,7 @@ int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
 
 	if (ret < 0)
 		pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
-			__func__, entry << IOMMU_PAGE_SHIFT_4K, tce, ret);
+			__func__, entry << tbl->it_page_shift, tce, ret);
 
 	return ret;
 }
@@ -1127,6 +1127,12 @@ int iommu_add_device(struct device *dev)
 	pr_debug("iommu_tce: adding %s to iommu group %d\n",
 			dev_name(dev), iommu_group_id(tbl->it_group));
 
+	if (PAGE_SIZE < IOMMU_PAGE_SIZE(tbl)) {
+		pr_err("iommu_tce: unsupported iommu page size.");
+		pr_err("%s has not been added\n", dev_name(dev));
+		return -EINVAL;
+	}
+
 	ret = iommu_group_add_device(tbl->it_group, dev);
 	if (ret < 0)
 		pr_err("iommu_tce: %s has not been added, ret=%d\n",
diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c
index 170ac24a0106..826d8bd9e522 100644
--- a/arch/powerpc/kernel/vio.c
+++ b/arch/powerpc/kernel/vio.c
@@ -518,16 +518,18 @@ static dma_addr_t vio_dma_iommu_map_page(struct device *dev, struct page *page,
                                          struct dma_attrs *attrs)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
+	struct iommu_table *tbl;
 	dma_addr_t ret = DMA_ERROR_CODE;
 
-	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE_4K))) {
+	tbl = get_iommu_table_base(dev);
+	if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)))) {
 		atomic_inc(&viodev->cmo.allocs_failed);
 		return ret;
 	}
 
 	ret = dma_iommu_ops.map_page(dev, page, offset, size, direction, attrs);
 	if (unlikely(dma_mapping_error(dev, ret))) {
-		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE_4K));
+		vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
 		atomic_inc(&viodev->cmo.allocs_failed);
 	}
 
@@ -540,10 +542,12 @@ static void vio_dma_iommu_unmap_page(struct device *dev, dma_addr_t dma_handle,
 				     struct dma_attrs *attrs)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
+	struct iommu_table *tbl;
 
+	tbl = get_iommu_table_base(dev);
 	dma_iommu_ops.unmap_page(dev, dma_handle, size, direction, attrs);
 
-	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE_4K));
+	vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE(tbl)));
 }
 
 static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -551,12 +555,14 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
                                 struct dma_attrs *attrs)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
+	struct iommu_table *tbl;
 	struct scatterlist *sgl;
 	int ret, count = 0;
 	size_t alloc_size = 0;
 
+	tbl = get_iommu_table_base(dev);
 	for (sgl = sglist; count < nelems; count++, sgl++)
-		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE_4K);
+		alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
 	if (vio_cmo_alloc(viodev, alloc_size)) {
 		atomic_inc(&viodev->cmo.allocs_failed);
@@ -572,7 +578,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist,
 	}
 
 	for (sgl = sglist, count = 0; count < ret; count++, sgl++)
-		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE_4K);
+		alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 	if (alloc_size)
 		vio_cmo_dealloc(viodev, alloc_size);
 
@@ -585,12 +591,14 @@ static void vio_dma_iommu_unmap_sg(struct device *dev,
 		struct dma_attrs *attrs)
 {
 	struct vio_dev *viodev = to_vio_dev(dev);
+	struct iommu_table *tbl;
 	struct scatterlist *sgl;
 	size_t alloc_size = 0;
 	int count = 0;
 
+	tbl = get_iommu_table_base(dev);
 	for (sgl = sglist; count < nelems; count++, sgl++)
-		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE_4K);
+		alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE(tbl));
 
 	dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs);
 
@@ -706,11 +714,14 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev)
 {
 	struct vio_cmo_dev_entry *dev_ent;
 	struct device *dev = &viodev->dev;
+	struct iommu_table *tbl;
 	struct vio_driver *viodrv = to_vio_driver(dev->driver);
 	unsigned long flags;
 	size_t size;
 	bool dma_capable = false;
 
+	tbl = get_iommu_table_base(dev);
+
 	/* A device requires entitlement if it has a DMA window property */
 	switch (viodev->family) {
 	case VDEVICE:
@@ -737,7 +748,7 @@ static int vio_cmo_bus_probe(struct vio_dev *viodev)
 		}
 
 		viodev->cmo.desired =
-			IOMMU_PAGE_ALIGN_4K(viodrv->get_desired_dma(viodev));
+			IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev), tbl);
 		if (viodev->cmo.desired < VIO_CMO_MIN_ENT)
 			viodev->cmo.desired = VIO_CMO_MIN_ENT;
 		size = VIO_CMO_MIN_ENT;
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 569b46423a93..b555ebc57ef5 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -762,8 +762,6 @@ static struct notifier_block tce_iommu_bus_nb = {
 
 static int __init tce_iommu_bus_notifier_init(void)
 {
-	BUILD_BUG_ON(PAGE_SIZE < IOMMU_PAGE_SIZE_4K);
-
 	bus_register_notifier(&pci_bus_type, &tce_iommu_bus_nb);
 	return 0;
 }
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index f7d7538b6bd9..d04dbaba83dd 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -1276,31 +1276,34 @@ static unsigned long ibmveth_get_desired_dma(struct vio_dev *vdev)
 {
 	struct net_device *netdev = dev_get_drvdata(&vdev->dev);
 	struct ibmveth_adapter *adapter;
+	struct iommu_table *tbl;
 	unsigned long ret;
 	int i;
 	int rxqentries = 1;
 
+	tbl = get_iommu_table_base(&vdev->dev);
+
 	/* netdev inits at probe time along with the structures we need below*/
 	if (netdev == NULL)
-		return IOMMU_PAGE_ALIGN_4K(IBMVETH_IO_ENTITLEMENT_DEFAULT);
+		return IOMMU_PAGE_ALIGN(IBMVETH_IO_ENTITLEMENT_DEFAULT, tbl);
 
 	adapter = netdev_priv(netdev);
 
 	ret = IBMVETH_BUFF_LIST_SIZE + IBMVETH_FILT_LIST_SIZE;
-	ret += IOMMU_PAGE_ALIGN_4K(netdev->mtu);
+	ret += IOMMU_PAGE_ALIGN(netdev->mtu, tbl);
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) {
 		/* add the size of the active receive buffers */
 		if (adapter->rx_buff_pool[i].active)
 			ret +=
 			    adapter->rx_buff_pool[i].size *
-			    IOMMU_PAGE_ALIGN_4K(adapter->rx_buff_pool[i].
-			            buff_size);
+			    IOMMU_PAGE_ALIGN(adapter->rx_buff_pool[i].
+					     buff_size, tbl);
 		rxqentries += adapter->rx_buff_pool[i].size;
 	}
 	/* add the size of the receive queue entries */
-	ret += IOMMU_PAGE_ALIGN_4K(
-		rxqentries * sizeof(struct ibmveth_rx_q_entry));
+	ret += IOMMU_PAGE_ALIGN(
+		rxqentries * sizeof(struct ibmveth_rx_q_entry), tbl);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 228b1a473037c89d524e03a569c688a22241b4ea Mon Sep 17 00:00:00 2001
From: Mihai Caraman <mihai.caraman@freescale.com>
Date: Thu, 8 Aug 2013 15:56:09 +0300
Subject: powerpc/booke64: Add LRAT error exception handler

LRAT (Logical to Real Address Translation) present in MMU v2 provides hardware
translation from a logical page number (LPN) to a real page number (RPN) when
tlbwe is executed by a guest or when a page table translation occurs from a
guest virtual address.

Add LRAT error exception handler to Booke3E 64-bit kernel and the basic KVM
handler to avoid build breakage. This is a prerequisite for KVM LRAT support
that will follow.

Signed-off-by: Mihai Caraman <mihai.caraman@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/kvm_asm.h        |  1 +
 arch/powerpc/include/asm/reg_booke.h      |  1 +
 arch/powerpc/kernel/cpu_setup_fsl_booke.S | 12 ++++++++++++
 arch/powerpc/kernel/exceptions-64e.S      | 17 +++++++++++++++++
 arch/powerpc/kvm/bookehv_interrupts.S     |  2 ++
 5 files changed, 33 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 1bd92fd43cfb..1503d8c7c41b 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -74,6 +74,7 @@
 #define BOOKE_INTERRUPT_GUEST_DBELL_CRIT 39
 #define BOOKE_INTERRUPT_HV_SYSCALL 40
 #define BOOKE_INTERRUPT_HV_PRIV 41
+#define BOOKE_INTERRUPT_LRAT_ERROR 42
 
 /* book3s */
 
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2e31aacd8acc..1f7134dd0946 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -101,6 +101,7 @@
 #define SPRN_IVOR39	0x1B1	/* Interrupt Vector Offset Register 39 */
 #define SPRN_IVOR40	0x1B2	/* Interrupt Vector Offset Register 40 */
 #define SPRN_IVOR41	0x1B3	/* Interrupt Vector Offset Register 41 */
+#define SPRN_IVOR42	0x1B4	/* Interrupt Vector Offset Register 42 */
 #define SPRN_GIVOR2	0x1B8	/* Guest IVOR2 */
 #define SPRN_GIVOR3	0x1B9	/* Guest IVOR3 */
 #define SPRN_GIVOR4	0x1BA	/* Guest IVOR4 */
diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
index bfb18c7290b7..fa6862db8a02 100644
--- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S
+++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
@@ -57,6 +57,12 @@ _GLOBAL(__setup_cpu_e6500)
 	mflr	r6
 #ifdef CONFIG_PPC64
 	bl	.setup_altivec_ivors
+	/* Touch IVOR42 only if the CPU supports E.HV category */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	.setup_lrat_ivor
+1:
 #endif
 	bl	__setup_cpu_e5500
 	mtlr	r6
@@ -119,6 +125,12 @@ _GLOBAL(__setup_cpu_e5500)
 _GLOBAL(__restore_cpu_e6500)
 	mflr	r5
 	bl	.setup_altivec_ivors
+	/* Touch IVOR42 only if the CPU supports E.HV category */
+	mfspr	r10,SPRN_MMUCFG
+	rlwinm.	r10,r10,0,MMUCFG_LPIDSIZE
+	beq	1f
+	bl	.setup_lrat_ivor
+1:
 	bl	__restore_cpu_e5500
 	mtlr	r5
 	blr
diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S
index e7751561fd1d..4d5a0b1034e8 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -308,6 +308,7 @@ interrupt_base_book3e:					/* fake trap */
 	EXCEPTION_STUB(0x2e0, guest_doorbell_crit)
 	EXCEPTION_STUB(0x300, hypercall)
 	EXCEPTION_STUB(0x320, ehpriv)
+	EXCEPTION_STUB(0x340, lrat_error)
 
 	.globl interrupt_end_book3e
 interrupt_end_book3e:
@@ -677,6 +678,17 @@ kernel_dbg_exc:
 	bl	.unknown_exception
 	b	.ret_from_except
 
+/* LRAT Error interrupt */
+	START_EXCEPTION(lrat_error);
+	NORMAL_EXCEPTION_PROLOG(0x340, BOOKE_INTERRUPT_LRAT_ERROR,
+			        PROLOG_ADDITION_NONE)
+	EXCEPTION_COMMON(0x340, PACA_EXGEN, INTS_KEEP)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.save_nvgprs
+	INTS_RESTORE_HARD
+	bl	.unknown_exception
+	b	.ret_from_except
+
 /*
  * An interrupt came in while soft-disabled; We mark paca->irq_happened
  * accordingly and if the interrupt is level sensitive, we hard disable
@@ -859,6 +871,7 @@ BAD_STACK_TRAMPOLINE(0x2e0)
 BAD_STACK_TRAMPOLINE(0x300)
 BAD_STACK_TRAMPOLINE(0x310)
 BAD_STACK_TRAMPOLINE(0x320)
+BAD_STACK_TRAMPOLINE(0x340)
 BAD_STACK_TRAMPOLINE(0x400)
 BAD_STACK_TRAMPOLINE(0x500)
 BAD_STACK_TRAMPOLINE(0x600)
@@ -1414,3 +1427,7 @@ _GLOBAL(setup_ehv_ivors)
 	SET_IVOR(38, 0x2c0) /* Guest Processor Doorbell */
 	SET_IVOR(39, 0x2e0) /* Guest Processor Doorbell Crit/MC */
 	blr
+
+_GLOBAL(setup_lrat_ivor)
+	SET_IVOR(42, 0x340) /* LRAT Error */
+	blr
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index e8ed7d659c55..a0d6929d8678 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -319,6 +319,8 @@ kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
 	SPRN_DSRR0, SPRN_DSRR1, 0
 kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
 	SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_LRAT_ERROR, EX_PARAMS(GEN), \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
 #else
 /*
  * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
-- 
cgit v1.2.3


From 640e922501103aaf2e0abb4cf4de5d49fa8342f7 Mon Sep 17 00:00:00 2001
From: Joseph Myers <joseph@codesourcery.com>
Date: Tue, 10 Dec 2013 23:07:45 +0000
Subject: powerpc: fix exception clearing in e500 SPE float emulation

The e500 SPE floating-point emulation code clears existing exceptions
(__FPU_FPSCR &= ~FP_EX_MASK;) before ORing in the exceptions from the
emulated operation.  However, these exception bits are the "sticky",
cumulative exception bits, and should only be cleared by the user
program setting SPEFSCR, not implicitly by any floating-point
instruction (whether executed purely by the hardware or emulated).
The spurious clearing of these bits shows up as missing exceptions in
glibc testing.

Fixing this, however, is not as simple as just not clearing the bits,
because while the bits may be from previous floating-point operations
(in which case they should not be cleared), the processor can also set
the sticky bits itself before the interrupt for an exception occurs,
and this can happen in cases when IEEE 754 semantics are that the
sticky bit should not be set.  Specifically, the "invalid" sticky bit
is set in various cases with non-finite operands, where IEEE 754
semantics do not involve raising such an exception, and the
"underflow" sticky bit is set in cases of exact underflow, whereas
IEEE 754 semantics are that this flag is set only for inexact
underflow.  Thus, for correct emulation the kernel needs to know the
setting of these two sticky bits before the instruction being
emulated.

When a floating-point operation raises an exception, the kernel can
note the state of the sticky bits immediately afterwards.  Some
<fenv.h> functions that affect the state of these bits, such as
fesetenv and feholdexcept, need to use prctl with PR_GET_FPEXC and
PR_SET_FPEXC anyway, and so it is natural to record the state of those
bits during that call into the kernel and so avoid any need for a
separate call into the kernel to inform it of a change to those bits.
Thus, the interface I chose to use (in this patch and the glibc port)
is that one of those prctl calls must be made after any userspace
change to those sticky bits, other than through a floating-point
operation that traps into the kernel anyway.  feclearexcept and
fesetexceptflag duly make those calls, which would not be required
were it not for this issue.

The previous EGLIBC port, and the uClibc code copied from it, is
fundamentally broken as regards any use of prctl for floating-point
exceptions because it didn't use the PR_FP_EXC_SW_ENABLE bit in its
prctl calls (and did various worse things, such as passing a pointer
when prctl expected an integer).  If you avoid anything where prctl is
used, the clearing of sticky bits still means it will never give
anything approximating correct exception semantics with existing
kernels.  I don't believe the patch makes things any worse for
existing code that doesn't try to inform the kernel of changes to
sticky bits - such code may get incorrect exceptions in some cases,
but it would have done so anyway in other cases.

Signed-off-by: Joseph Myers <joseph@codesourcery.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/processor.h |  6 +++++-
 arch/powerpc/kernel/process.c        | 30 ++++++++++++++++++++++++++++--
 arch/powerpc/math-emu/math_efp.c     | 20 +++++++++++++++++++-
 3 files changed, 52 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index fc14a38c7ccf..91441d9cbaae 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -256,6 +256,8 @@ struct thread_struct {
 	unsigned long	evr[32];	/* upper 32-bits of SPE regs */
 	u64		acc;		/* Accumulator */
 	unsigned long	spefscr;	/* SPE & eFP status */
+	unsigned long	spefscr_last;	/* SPEFSCR value on last prctl
+					   call or trap return */
 	int		used_spe;	/* set if process has used spe */
 #endif /* CONFIG_SPE */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
@@ -317,7 +319,9 @@ struct thread_struct {
 	(_ALIGN_UP(sizeof(init_thread_info), 16) + (unsigned long) &init_stack)
 
 #ifdef CONFIG_SPE
-#define SPEFSCR_INIT .spefscr = SPEFSCR_FINVE | SPEFSCR_FDBZE | SPEFSCR_FUNFE | SPEFSCR_FOVFE,
+#define SPEFSCR_INIT \
+	.spefscr = SPEFSCR_FINVE | SPEFSCR_FDBZE | SPEFSCR_FUNFE | SPEFSCR_FOVFE, \
+	.spefscr_last = SPEFSCR_FINVE | SPEFSCR_FDBZE | SPEFSCR_FUNFE | SPEFSCR_FOVFE,
 #else
 #define SPEFSCR_INIT
 #endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 3386d8ab7eb0..b08c0d03530f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1175,6 +1175,19 @@ int set_fpexc_mode(struct task_struct *tsk, unsigned int val)
 	if (val & PR_FP_EXC_SW_ENABLE) {
 #ifdef CONFIG_SPE
 		if (cpu_has_feature(CPU_FTR_SPE)) {
+			/*
+			 * When the sticky exception bits are set
+			 * directly by userspace, it must call prctl
+			 * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE
+			 * in the existing prctl settings) or
+			 * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in
+			 * the bits being set).  <fenv.h> functions
+			 * saving and restoring the whole
+			 * floating-point environment need to do so
+			 * anyway to restore the prctl settings from
+			 * the saved environment.
+			 */
+			tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
 			tsk->thread.fpexc_mode = val &
 				(PR_FP_EXC_SW_ENABLE | PR_FP_ALL_EXCEPT);
 			return 0;
@@ -1206,9 +1219,22 @@ int get_fpexc_mode(struct task_struct *tsk, unsigned long adr)
 
 	if (tsk->thread.fpexc_mode & PR_FP_EXC_SW_ENABLE)
 #ifdef CONFIG_SPE
-		if (cpu_has_feature(CPU_FTR_SPE))
+		if (cpu_has_feature(CPU_FTR_SPE)) {
+			/*
+			 * When the sticky exception bits are set
+			 * directly by userspace, it must call prctl
+			 * with PR_GET_FPEXC (with PR_FP_EXC_SW_ENABLE
+			 * in the existing prctl settings) or
+			 * PR_SET_FPEXC (with PR_FP_EXC_SW_ENABLE in
+			 * the bits being set).  <fenv.h> functions
+			 * saving and restoring the whole
+			 * floating-point environment need to do so
+			 * anyway to restore the prctl settings from
+			 * the saved environment.
+			 */
+			tsk->thread.spefscr_last = mfspr(SPRN_SPEFSCR);
 			val = tsk->thread.fpexc_mode;
-		else
+		} else
 			return -EINVAL;
 #else
 		return -EINVAL;
diff --git a/arch/powerpc/math-emu/math_efp.c b/arch/powerpc/math-emu/math_efp.c
index a73f0884d358..59835c625dc6 100644
--- a/arch/powerpc/math-emu/math_efp.c
+++ b/arch/powerpc/math-emu/math_efp.c
@@ -630,9 +630,27 @@ update_ccr:
 	regs->ccr |= (IR << ((7 - ((speinsn >> 23) & 0x7)) << 2));
 
 update_regs:
-	__FPU_FPSCR &= ~FP_EX_MASK;
+	/*
+	 * If the "invalid" exception sticky bit was set by the
+	 * processor for non-finite input, but was not set before the
+	 * instruction being emulated, clear it.  Likewise for the
+	 * "underflow" bit, which may have been set by the processor
+	 * for exact underflow, not just inexact underflow when the
+	 * flag should be set for IEEE 754 semantics.  Other sticky
+	 * exceptions will only be set by the processor when they are
+	 * correct according to IEEE 754 semantics, and we must not
+	 * clear sticky bits that were already set before the emulated
+	 * instruction as they represent the user-visible sticky
+	 * exception status.  "inexact" traps to kernel are not
+	 * required for IEEE semantics and are not enabled by default,
+	 * so the "inexact" sticky bit may have been set by a previous
+	 * instruction without the kernel being aware of it.
+	 */
+	__FPU_FPSCR
+	  &= ~(FP_EX_INVALID | FP_EX_UNDERFLOW) | current->thread.spefscr_last;
 	__FPU_FPSCR |= (FP_CUR_EXCEPTIONS & FP_EX_MASK);
 	mtspr(SPRN_SPEFSCR, __FPU_FPSCR);
+	current->thread.spefscr_last = __FPU_FPSCR;
 
 	current->thread.evr[fc] = vc.wp[0];
 	regs->gpr[fc] = vc.wp[1];
-- 
cgit v1.2.3


From 71a6fa17e1526d3f26f4711cc55d339f96c49a95 Mon Sep 17 00:00:00 2001
From: Wang Dongsheng <dongsheng.wang@freescale.com>
Date: Tue, 17 Dec 2013 16:16:59 +0800
Subject: powerpc/fsl: add E6500 PVR and SPRN_PWRMGTCR0 define

E6500 PVR and SPRN_PWRMGTCR0 will be used in subsequent pw20/altivec
idle patches.

Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/reg.h       | 2 ++
 arch/powerpc/include/asm/reg_booke.h | 9 +++++++++
 2 files changed, 11 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index fa8388ed94c5..62b114e079cf 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1075,6 +1075,8 @@
 #define PVR_8560	0x80200000
 #define PVR_VER_E500V1	0x8020
 #define PVR_VER_E500V2	0x8021
+#define PVR_VER_E6500	0x8040
+
 /*
  * For the 8xx processors, all of them report the same PVR family for
  * the PowerPC core. The various versions of these processors must be
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 1f7134dd0946..163c3b05a76e 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -171,6 +171,7 @@
 #define SPRN_L2CSR1	0x3FA	/* L2 Data Cache Control and Status Register 1 */
 #define SPRN_DCCR	0x3FA	/* Data Cache Cacheability Register */
 #define SPRN_ICCR	0x3FB	/* Instruction Cache Cacheability Register */
+#define SPRN_PWRMGTCR0	0x3FB	/* Power management control register 0 */
 #define SPRN_SVR	0x3FF	/* System Version Register */
 
 /*
@@ -217,6 +218,14 @@
 #define	CCR1_DPC	0x00000100 /* Disable L1 I-Cache/D-Cache parity checking */
 #define	CCR1_TCS	0x00000080 /* Timer Clock Select */
 
+/* Bit definitions for PWRMGTCR0. */
+#define PWRMGTCR0_PW20_WAIT		(1 << 14) /* PW20 state enable bit */
+#define PWRMGTCR0_PW20_ENT_SHIFT	8
+#define PWRMGTCR0_PW20_ENT		0x3F00
+#define PWRMGTCR0_AV_IDLE_PD_EN		(1 << 22) /* Altivec idle enable */
+#define PWRMGTCR0_AV_IDLE_CNT_SHIFT	16
+#define PWRMGTCR0_AV_IDLE_CNT		0x3F0000
+
 /* Bit definitions for the MCSR. */
 #define MCSR_MCS	0x80000000 /* Machine Check Summary */
 #define MCSR_IB		0x40000000 /* Instruction PLB Error */
-- 
cgit v1.2.3


From 1820a8d2163554dcd0272cc095338499db4a4dc3 Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Tue, 8 Oct 2013 09:32:19 +0530
Subject: kvm/powerpc: rename kvm_hypercall() to epapr_hypercall()

kvm_hypercall() have nothing KVM specific, so renamed to epapr_hypercall().
Also this in moved to arch/powerpc/include/asm/epapr_hcalls.h

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/epapr_hcalls.h | 46 +++++++++++++++++++++++++++++++++
 arch/powerpc/include/asm/kvm_para.h     | 23 +++++------------
 arch/powerpc/kernel/kvm.c               | 41 ++---------------------------
 3 files changed, 54 insertions(+), 56 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index 86b0ac79990c..eec87f9494bc 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -460,5 +460,51 @@ static inline unsigned int ev_idle(void)
 
 	return r3;
 }
+
+#ifdef CONFIG_EPAPR_PARAVIRT
+static inline unsigned long epapr_hypercall(unsigned long *in,
+			    unsigned long *out,
+			    unsigned long nr)
+{
+	unsigned long register r0 asm("r0");
+	unsigned long register r3 asm("r3") = in[0];
+	unsigned long register r4 asm("r4") = in[1];
+	unsigned long register r5 asm("r5") = in[2];
+	unsigned long register r6 asm("r6") = in[3];
+	unsigned long register r7 asm("r7") = in[4];
+	unsigned long register r8 asm("r8") = in[5];
+	unsigned long register r9 asm("r9") = in[6];
+	unsigned long register r10 asm("r10") = in[7];
+	unsigned long register r11 asm("r11") = nr;
+	unsigned long register r12 asm("r12");
+
+	asm volatile("bl	epapr_hypercall_start"
+		     : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
+		       "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
+		       "=r"(r12)
+		     : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8),
+		       "r"(r9), "r"(r10), "r"(r11)
+		     : "memory", "cc", "xer", "ctr", "lr");
+
+	out[0] = r4;
+	out[1] = r5;
+	out[2] = r6;
+	out[3] = r7;
+	out[4] = r8;
+	out[5] = r9;
+	out[6] = r10;
+	out[7] = r11;
+
+	return r3;
+}
+#else
+static unsigned long epapr_hypercall(unsigned long *in,
+				   unsigned long *out,
+				   unsigned long nr)
+{
+	return EV_UNIMPLEMENTED;
+}
+#endif
+
 #endif /* !__ASSEMBLY__ */
 #endif /* _EPAPR_HCALLS_H */
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 2b119654b4c1..c18660ef26d8 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -39,10 +39,6 @@ static inline int kvm_para_available(void)
 	return 1;
 }
 
-extern unsigned long kvm_hypercall(unsigned long *in,
-				   unsigned long *out,
-				   unsigned long nr);
-
 #else
 
 static inline int kvm_para_available(void)
@@ -50,13 +46,6 @@ static inline int kvm_para_available(void)
 	return 0;
 }
 
-static unsigned long kvm_hypercall(unsigned long *in,
-				   unsigned long *out,
-				   unsigned long nr)
-{
-	return EV_UNIMPLEMENTED;
-}
-
 #endif
 
 static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
@@ -65,7 +54,7 @@ static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
 	unsigned long out[8];
 	unsigned long r;
 
-	r = kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
+	r = epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 	*r2 = out[0];
 
 	return r;
@@ -76,7 +65,7 @@ static inline long kvm_hypercall0(unsigned int nr)
 	unsigned long in[8];
 	unsigned long out[8];
 
-	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
+	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
@@ -85,7 +74,7 @@ static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
 	unsigned long out[8];
 
 	in[0] = p1;
-	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
+	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
@@ -96,7 +85,7 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
 
 	in[0] = p1;
 	in[1] = p2;
-	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
+	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
@@ -108,7 +97,7 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
 	in[0] = p1;
 	in[1] = p2;
 	in[2] = p3;
-	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
+	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
@@ -122,7 +111,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 	in[1] = p2;
 	in[2] = p3;
 	in[3] = p4;
-	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
+	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index db28032e320e..6a0175297b0d 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -413,13 +413,13 @@ static void kvm_map_magic_page(void *data)
 {
 	u32 *features = data;
 
-	ulong in[8];
+	ulong in[8] = {0};
 	ulong out[8];
 
 	in[0] = KVM_MAGIC_PAGE;
 	in[1] = KVM_MAGIC_PAGE;
 
-	kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
+	epapr_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
 
 	*features = out[0];
 }
@@ -711,43 +711,6 @@ static void kvm_use_magic_page(void)
 			 kvm_patching_worked ? "worked" : "failed");
 }
 
-unsigned long kvm_hypercall(unsigned long *in,
-			    unsigned long *out,
-			    unsigned long nr)
-{
-	unsigned long register r0 asm("r0");
-	unsigned long register r3 asm("r3") = in[0];
-	unsigned long register r4 asm("r4") = in[1];
-	unsigned long register r5 asm("r5") = in[2];
-	unsigned long register r6 asm("r6") = in[3];
-	unsigned long register r7 asm("r7") = in[4];
-	unsigned long register r8 asm("r8") = in[5];
-	unsigned long register r9 asm("r9") = in[6];
-	unsigned long register r10 asm("r10") = in[7];
-	unsigned long register r11 asm("r11") = nr;
-	unsigned long register r12 asm("r12");
-
-	asm volatile("bl	epapr_hypercall_start"
-		     : "=r"(r0), "=r"(r3), "=r"(r4), "=r"(r5), "=r"(r6),
-		       "=r"(r7), "=r"(r8), "=r"(r9), "=r"(r10), "=r"(r11),
-		       "=r"(r12)
-		     : "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7), "r"(r8),
-		       "r"(r9), "r"(r10), "r"(r11)
-		     : "memory", "cc", "xer", "ctr", "lr");
-
-	out[0] = r4;
-	out[1] = r5;
-	out[2] = r6;
-	out[3] = r7;
-	out[4] = r8;
-	out[5] = r9;
-	out[6] = r10;
-	out[7] = r11;
-
-	return r3;
-}
-EXPORT_SYMBOL_GPL(kvm_hypercall);
-
 static __init void kvm_free_tmp(void)
 {
 	free_reserved_area(&kvm_tmp[kvm_tmp_index],
-- 
cgit v1.2.3


From b1f0d94c26b64e814243b736f47e7ef40d96432c Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Tue, 8 Oct 2013 09:32:20 +0530
Subject: kvm/powerpc: move kvm_hypercall0() and friends to epapr_hypercall0()

kvm_hypercall0() and friends have nothing KVM specific so moved to
epapr_hypercall0() and friends. Also they are moved from
arch/powerpc/include/asm/kvm_para.h to arch/powerpc/include/asm/epapr_hcalls.h

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/epapr_hcalls.h | 65 +++++++++++++++++++++++++++++++
 arch/powerpc/include/asm/kvm_para.h     | 69 +--------------------------------
 2 files changed, 66 insertions(+), 68 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index eec87f9494bc..334459ad145b 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -506,5 +506,70 @@ static unsigned long epapr_hypercall(unsigned long *in,
 }
 #endif
 
+static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+	unsigned long r;
+
+	r = epapr_hypercall(in, out, nr);
+	*r2 = out[0];
+
+	return r;
+}
+
+static inline long epapr_hypercall0(unsigned int nr)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	return epapr_hypercall(in, out, nr);
+}
+
+static inline long epapr_hypercall1(unsigned int nr, unsigned long p1)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	return epapr_hypercall(in, out, nr);
+}
+
+static inline long epapr_hypercall2(unsigned int nr, unsigned long p1,
+				    unsigned long p2)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	return epapr_hypercall(in, out, nr);
+}
+
+static inline long epapr_hypercall3(unsigned int nr, unsigned long p1,
+				    unsigned long p2, unsigned long p3)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	in[2] = p3;
+	return epapr_hypercall(in, out, nr);
+}
+
+static inline long epapr_hypercall4(unsigned int nr, unsigned long p1,
+				    unsigned long p2, unsigned long p3,
+				    unsigned long p4)
+{
+	unsigned long in[8];
+	unsigned long out[8];
+
+	in[0] = p1;
+	in[1] = p2;
+	in[2] = p3;
+	in[3] = p4;
+	return epapr_hypercall(in, out, nr);
+}
 #endif /* !__ASSEMBLY__ */
 #endif /* _EPAPR_HCALLS_H */
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index c18660ef26d8..336a91acb8b1 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -48,73 +48,6 @@ static inline int kvm_para_available(void)
 
 #endif
 
-static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
-{
-	unsigned long in[8];
-	unsigned long out[8];
-	unsigned long r;
-
-	r = epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
-	*r2 = out[0];
-
-	return r;
-}
-
-static inline long kvm_hypercall0(unsigned int nr)
-{
-	unsigned long in[8];
-	unsigned long out[8];
-
-	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
-}
-
-static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
-{
-	unsigned long in[8];
-	unsigned long out[8];
-
-	in[0] = p1;
-	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
-}
-
-static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
-				  unsigned long p2)
-{
-	unsigned long in[8];
-	unsigned long out[8];
-
-	in[0] = p1;
-	in[1] = p2;
-	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
-}
-
-static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
-				  unsigned long p2, unsigned long p3)
-{
-	unsigned long in[8];
-	unsigned long out[8];
-
-	in[0] = p1;
-	in[1] = p2;
-	in[2] = p3;
-	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
-}
-
-static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
-				  unsigned long p2, unsigned long p3,
-				  unsigned long p4)
-{
-	unsigned long in[8];
-	unsigned long out[8];
-
-	in[0] = p1;
-	in[1] = p2;
-	in[2] = p3;
-	in[3] = p4;
-	return epapr_hypercall(in, out, KVM_HCALL_TOKEN(nr));
-}
-
-
 static inline unsigned int kvm_arch_para_features(void)
 {
 	unsigned long r;
@@ -122,7 +55,7 @@ static inline unsigned int kvm_arch_para_features(void)
 	if (!kvm_para_available())
 		return 0;
 
-	if(kvm_hypercall0_1(KVM_HC_FEATURES, &r))
+	if(epapr_hypercall0_1(KVM_HCALL_TOKEN(KVM_HC_FEATURES), &r))
 		return 0;
 
 	return r;
-- 
cgit v1.2.3


From 09548fdaf32ce77a68e7f9a8a3098c1306b04858 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 15 Oct 2013 20:43:01 +1100
Subject: KVM: PPC: Use load_fp/vr_state rather than load_up_fpu/altivec

The load_up_fpu and load_up_altivec functions were never intended to
be called from C, and do things like modifying the MSR value in their
callers' stack frames, which are assumed to be interrupt frames.  In
addition, on 32-bit Book S they require the MMU to be off.

This makes KVM use the new load_fp_state() and load_vr_state() functions
instead of load_up_fpu/altivec.  This means we can remove the assembler
glue in book3s_rmhandlers.S, and potentially fixes a bug on Book E,
where load_up_fpu was called directly from C.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s.h |  3 ---
 arch/powerpc/include/asm/switch_to.h  |  2 --
 arch/powerpc/kvm/book3s_exports.c     |  4 ---
 arch/powerpc/kvm/book3s_pr.c          | 18 +++++++++-----
 arch/powerpc/kvm/book3s_rmhandlers.S  | 47 -----------------------------------
 arch/powerpc/kvm/booke.h              |  3 ++-
 6 files changed, 14 insertions(+), 63 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 4a594b76674d..8bb870694616 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -186,9 +186,6 @@ extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr,
 
 extern void kvmppc_entry_trampoline(void);
 extern void kvmppc_hv_entry_trampoline(void);
-extern void kvmppc_load_up_fpu(void);
-extern void kvmppc_load_up_altivec(void);
-extern void kvmppc_load_up_vsx(void);
 extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst);
 extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst);
 extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd);
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 9ee12610af02..971ca336f451 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -25,10 +25,8 @@ static inline void save_tar(struct thread_struct *prev)
 static inline void save_tar(struct thread_struct *prev) {}
 #endif
 
-extern void load_up_fpu(void);
 extern void enable_kernel_fp(void);
 extern void enable_kernel_altivec(void);
-extern void load_up_altivec(struct task_struct *);
 extern int emulate_altivec(struct pt_regs *);
 extern void __giveup_vsx(struct task_struct *);
 extern void giveup_vsx(struct task_struct *);
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index 852989a9bad3..20d4ea8e656d 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -25,9 +25,5 @@ EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline);
 #endif
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline);
-EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
-#ifdef CONFIG_ALTIVEC
-EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
-#endif
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 21bf7c5c9545..d63a91f825d3 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -691,7 +691,8 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #endif
 		t->fp_state.fpscr = vcpu->arch.fpscr;
 		t->fpexc_mode = 0;
-		kvmppc_load_up_fpu();
+		enable_kernel_fp();
+		load_fp_state(&t->fp_state);
 	}
 
 	if (msr & MSR_VEC) {
@@ -699,7 +700,8 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 		memcpy(t->vr_state.vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
 		t->vr_state.vscr = vcpu->arch.vscr;
 		t->vrsave = -1;
-		kvmppc_load_up_altivec();
+		enable_kernel_altivec();
+		load_vr_state(&t->vr_state);
 #endif
 	}
 
@@ -722,11 +724,15 @@ static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu)
 	if (!lost_ext)
 		return;
 
-	if (lost_ext & MSR_FP)
-		kvmppc_load_up_fpu();
+	if (lost_ext & MSR_FP) {
+		enable_kernel_fp();
+		load_fp_state(&current->thread.fp_state);
+	}
 #ifdef CONFIG_ALTIVEC
-	if (lost_ext & MSR_VEC)
-		kvmppc_load_up_altivec();
+	if (lost_ext & MSR_VEC) {
+		enable_kernel_altivec();
+		load_vr_state(&current->thread.vr_state);
+	}
 #endif
 	current->thread.regs->msr |= lost_ext;
 }
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index a38c4c9edab8..c78ffbc371a5 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -166,51 +166,4 @@ _GLOBAL(kvmppc_entry_trampoline)
 	mtsrr1	r6
 	RFI
 
-#if defined(CONFIG_PPC_BOOK3S_32)
-#define STACK_LR	INT_FRAME_SIZE+4
-
-/* load_up_xxx have to run with MSR_DR=0 on Book3S_32 */
-#define MSR_EXT_START						\
-	PPC_STL	r20, _NIP(r1);					\
-	mfmsr	r20;						\
-	LOAD_REG_IMMEDIATE(r3, MSR_DR|MSR_EE);			\
-	andc	r3,r20,r3;		/* Disable DR,EE */	\
-	mtmsr	r3;						\
-	sync
-
-#define MSR_EXT_END						\
-	mtmsr	r20;			/* Enable DR,EE */	\
-	sync;							\
-	PPC_LL	r20, _NIP(r1)
-
-#elif defined(CONFIG_PPC_BOOK3S_64)
-#define STACK_LR	_LINK
-#define MSR_EXT_START
-#define MSR_EXT_END
-#endif
-
-/*
- * Activate current's external feature (FPU/Altivec/VSX)
- */
-#define define_load_up(what) 					\
-								\
-_GLOBAL(kvmppc_load_up_ ## what);				\
-	PPC_STLU r1, -INT_FRAME_SIZE(r1);			\
-	mflr	r3;						\
-	PPC_STL	r3, STACK_LR(r1);				\
-	MSR_EXT_START;						\
-								\
-	bl	FUNC(load_up_ ## what);				\
-								\
-	MSR_EXT_END;						\
-	PPC_LL	r3, STACK_LR(r1);				\
-	mtlr	r3;						\
-	addi	r1, r1, INT_FRAME_SIZE;				\
-	blr
-
-define_load_up(fpu)
-#ifdef CONFIG_ALTIVEC
-define_load_up(altivec)
-#endif
-
 #include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 09bfd9bc7cf8..fe59f225327f 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -136,7 +136,8 @@ static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_PPC_FPU
 	if (vcpu->fpu_active && !(current->thread.regs->msr & MSR_FP)) {
-		load_up_fpu();
+		enable_kernel_fp();
+		load_fp_state(&current->thread.fp_state);
 		current->thread.regs->msr |= MSR_FP;
 	}
 #endif
-- 
cgit v1.2.3


From efff19122315f1431f6b02cd2983b15f5d3957bd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 15 Oct 2013 20:43:02 +1100
Subject: KVM: PPC: Store FP/VSX/VMX state in thread_fp/vr_state structures

This uses struct thread_fp_state and struct thread_vr_state to store
the floating-point, VMX/Altivec and VSX state, rather than flat arrays.
This makes transferring the state to/from the thread_struct simpler
and allows us to unify the get/set_one_reg implementations for the
VSX registers.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h      |  12 +--
 arch/powerpc/kernel/asm-offsets.c        |  11 +-
 arch/powerpc/kvm/book3s.c                |  38 +++++--
 arch/powerpc/kvm/book3s_hv.c             |  42 --------
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |   4 +-
 arch/powerpc/kvm/book3s_paired_singles.c | 169 +++++++++++++++----------------
 arch/powerpc/kvm/book3s_pr.c             |  63 +-----------
 arch/powerpc/kvm/booke.c                 |   8 +-
 arch/powerpc/kvm/powerpc.c               |   4 +-
 9 files changed, 131 insertions(+), 220 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 237d1d25b448..3ca0b430eaee 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -410,8 +410,7 @@ struct kvm_vcpu_arch {
 
 	ulong gpr[32];
 
-	u64 fpr[32];
-	u64 fpscr;
+	struct thread_fp_state fp;
 
 #ifdef CONFIG_SPE
 	ulong evr[32];
@@ -420,12 +419,7 @@ struct kvm_vcpu_arch {
 	u64 acc;
 #endif
 #ifdef CONFIG_ALTIVEC
-	vector128 vr[32];
-	vector128 vscr;
-#endif
-
-#ifdef CONFIG_VSX
-	u64 vsr[64];
+	struct thread_vr_state vr;
 #endif
 
 #ifdef CONFIG_KVM_BOOKE_HV
@@ -619,6 +613,8 @@ struct kvm_vcpu_arch {
 #endif
 };
 
+#define VCPU_FPR(vcpu, i)	(vcpu)->arch.fp.fpr[i][TS_FPROFFSET]
+
 /* Values for vcpu->arch.state */
 #define KVMPPC_VCPU_NOTREADY		0
 #define KVMPPC_VCPU_RUNNABLE		1
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 2ea5cc033ec8..8403f9031d93 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -425,14 +425,11 @@ int main(void)
 	DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
-	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
-	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fpscr));
+	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fp.fpr));
+	DEFINE(VCPU_FPSCR, offsetof(struct kvm_vcpu, arch.fp.fpscr));
 #ifdef CONFIG_ALTIVEC
-	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr));
-	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vscr));
-#endif
-#ifdef CONFIG_VSX
-	DEFINE(VCPU_VSRS, offsetof(struct kvm_vcpu, arch.vsr));
+	DEFINE(VCPU_VRS, offsetof(struct kvm_vcpu, arch.vr.vr));
+	DEFINE(VCPU_VSCR, offsetof(struct kvm_vcpu, arch.vr.vscr));
 #endif
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
 	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 48cf91bc862f..94e597e6f15c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -577,10 +577,10 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 			break;
 		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
 			i = reg->id - KVM_REG_PPC_FPR0;
-			val = get_reg_val(reg->id, vcpu->arch.fpr[i]);
+			val = get_reg_val(reg->id, VCPU_FPR(vcpu, i));
 			break;
 		case KVM_REG_PPC_FPSCR:
-			val = get_reg_val(reg->id, vcpu->arch.fpscr);
+			val = get_reg_val(reg->id, vcpu->arch.fp.fpscr);
 			break;
 #ifdef CONFIG_ALTIVEC
 		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
@@ -588,19 +588,30 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 				r = -ENXIO;
 				break;
 			}
-			val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0];
+			val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0];
 			break;
 		case KVM_REG_PPC_VSCR:
 			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
 				r = -ENXIO;
 				break;
 			}
-			val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
+			val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]);
 			break;
 		case KVM_REG_PPC_VRSAVE:
 			val = get_reg_val(reg->id, vcpu->arch.vrsave);
 			break;
 #endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_VSX
+		case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+			if (cpu_has_feature(CPU_FTR_VSX)) {
+				long int i = reg->id - KVM_REG_PPC_VSR0;
+				val.vsxval[0] = vcpu->arch.fp.fpr[i][0];
+				val.vsxval[1] = vcpu->arch.fp.fpr[i][1];
+			} else {
+				r = -ENXIO;
+			}
+			break;
+#endif /* CONFIG_VSX */
 		case KVM_REG_PPC_DEBUG_INST: {
 			u32 opcode = INS_TW;
 			r = copy_to_user((u32 __user *)(long)reg->addr,
@@ -656,10 +667,10 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 			break;
 		case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
 			i = reg->id - KVM_REG_PPC_FPR0;
-			vcpu->arch.fpr[i] = set_reg_val(reg->id, val);
+			VCPU_FPR(vcpu, i) = set_reg_val(reg->id, val);
 			break;
 		case KVM_REG_PPC_FPSCR:
-			vcpu->arch.fpscr = set_reg_val(reg->id, val);
+			vcpu->arch.fp.fpscr = set_reg_val(reg->id, val);
 			break;
 #ifdef CONFIG_ALTIVEC
 		case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
@@ -667,14 +678,14 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 				r = -ENXIO;
 				break;
 			}
-			vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
+			vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
 			break;
 		case KVM_REG_PPC_VSCR:
 			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
 				r = -ENXIO;
 				break;
 			}
-			vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
+			vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val);
 			break;
 		case KVM_REG_PPC_VRSAVE:
 			if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
@@ -684,6 +695,17 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
 			vcpu->arch.vrsave = set_reg_val(reg->id, val);
 			break;
 #endif /* CONFIG_ALTIVEC */
+#ifdef CONFIG_VSX
+		case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+			if (cpu_has_feature(CPU_FTR_VSX)) {
+				long int i = reg->id - KVM_REG_PPC_VSR0;
+				vcpu->arch.fp.fpr[i][0] = val.vsxval[0];
+				vcpu->arch.fp.fpr[i][1] = val.vsxval[1];
+			} else {
+				r = -ENXIO;
+			}
+			break;
+#endif /* CONFIG_VSX */
 #ifdef CONFIG_KVM_XICS
 		case KVM_REG_PPC_ICP_STATE:
 			if (!vcpu->arch.icp) {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 088a6e54c998..461f55566167 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -811,27 +811,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_SDAR:
 		*val = get_reg_val(id, vcpu->arch.sdar);
 		break;
-#ifdef CONFIG_VSX
-	case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
-		if (cpu_has_feature(CPU_FTR_VSX)) {
-			/* VSX => FP reg i is stored in arch.vsr[2*i] */
-			long int i = id - KVM_REG_PPC_FPR0;
-			*val = get_reg_val(id, vcpu->arch.vsr[2 * i]);
-		} else {
-			/* let generic code handle it */
-			r = -EINVAL;
-		}
-		break;
-	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
-		if (cpu_has_feature(CPU_FTR_VSX)) {
-			long int i = id - KVM_REG_PPC_VSR0;
-			val->vsxval[0] = vcpu->arch.vsr[2 * i];
-			val->vsxval[1] = vcpu->arch.vsr[2 * i + 1];
-		} else {
-			r = -ENXIO;
-		}
-		break;
-#endif /* CONFIG_VSX */
 	case KVM_REG_PPC_VPA_ADDR:
 		spin_lock(&vcpu->arch.vpa_update_lock);
 		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
@@ -914,27 +893,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_SDAR:
 		vcpu->arch.sdar = set_reg_val(id, *val);
 		break;
-#ifdef CONFIG_VSX
-	case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
-		if (cpu_has_feature(CPU_FTR_VSX)) {
-			/* VSX => FP reg i is stored in arch.vsr[2*i] */
-			long int i = id - KVM_REG_PPC_FPR0;
-			vcpu->arch.vsr[2 * i] = set_reg_val(id, *val);
-		} else {
-			/* let generic code handle it */
-			r = -EINVAL;
-		}
-		break;
-	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
-		if (cpu_has_feature(CPU_FTR_VSX)) {
-			long int i = id - KVM_REG_PPC_VSR0;
-			vcpu->arch.vsr[2 * i] = val->vsxval[0];
-			vcpu->arch.vsr[2 * i + 1] = val->vsxval[1];
-		} else {
-			r = -ENXIO;
-		}
-		break;
-#endif /* CONFIG_VSX */
 	case KVM_REG_PPC_VPA_ADDR:
 		addr = set_reg_val(id, *val);
 		r = -EINVAL;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index d5ddc2d10748..47fd536fb13b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1889,7 +1889,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 BEGIN_FTR_SECTION
 	reg = 0
 	.rept	32
-	li	r6,reg*16+VCPU_VSRS
+	li	r6,reg*16+VCPU_FPRS
 	STXVD2X(reg,R6,R3)
 	reg = reg + 1
 	.endr
@@ -1951,7 +1951,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 BEGIN_FTR_SECTION
 	reg = 0
 	.rept	32
-	li	r7,reg*16+VCPU_VSRS
+	li	r7,reg*16+VCPU_FPRS
 	LXVD2X(reg,R7,R4)
 	reg = reg + 1
 	.endr
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index a59a25a13218..c1abd95063f4 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -160,7 +160,7 @@
 
 static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
 {
-	kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt]);
+	kvm_cvt_df(&VCPU_FPR(vcpu, rt), &vcpu->arch.qpr[rt]);
 }
 
 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
@@ -207,11 +207,11 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	/* put in registers */
 	switch (ls_type) {
 	case FPU_LS_SINGLE:
-		kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs]);
+		kvm_cvt_fd((u32*)tmp, &VCPU_FPR(vcpu, rs));
 		vcpu->arch.qpr[rs] = *((u32*)tmp);
 		break;
 	case FPU_LS_DOUBLE:
-		vcpu->arch.fpr[rs] = *((u64*)tmp);
+		VCPU_FPR(vcpu, rs) = *((u64*)tmp);
 		break;
 	}
 
@@ -233,18 +233,18 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	switch (ls_type) {
 	case FPU_LS_SINGLE:
-		kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp);
+		kvm_cvt_df(&VCPU_FPR(vcpu, rs), (u32*)tmp);
 		val = *((u32*)tmp);
 		len = sizeof(u32);
 		break;
 	case FPU_LS_SINGLE_LOW:
-		*((u32*)tmp) = vcpu->arch.fpr[rs];
-		val = vcpu->arch.fpr[rs] & 0xffffffff;
+		*((u32*)tmp) = VCPU_FPR(vcpu, rs);
+		val = VCPU_FPR(vcpu, rs) & 0xffffffff;
 		len = sizeof(u32);
 		break;
 	case FPU_LS_DOUBLE:
-		*((u64*)tmp) = vcpu->arch.fpr[rs];
-		val = vcpu->arch.fpr[rs];
+		*((u64*)tmp) = VCPU_FPR(vcpu, rs);
+		val = VCPU_FPR(vcpu, rs);
 		len = sizeof(u64);
 		break;
 	default:
@@ -301,7 +301,7 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	emulated = EMULATE_DONE;
 
 	/* put in registers */
-	kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs]);
+	kvm_cvt_fd(&tmp[0], &VCPU_FPR(vcpu, rs));
 	vcpu->arch.qpr[rs] = tmp[1];
 
 	dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
@@ -319,7 +319,7 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	u32 tmp[2];
 	int len = w ? sizeof(u32) : sizeof(u64);
 
-	kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0]);
+	kvm_cvt_df(&VCPU_FPR(vcpu, rs), &tmp[0]);
 	tmp[1] = vcpu->arch.qpr[rs];
 
 	r = kvmppc_st(vcpu, &addr, len, tmp, true);
@@ -512,7 +512,6 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 						 u32 *src2, u32 *src3))
 {
 	u32 *qpr = vcpu->arch.qpr;
-	u64 *fpr = vcpu->arch.fpr;
 	u32 ps0_out;
 	u32 ps0_in1, ps0_in2, ps0_in3;
 	u32 ps1_in1, ps1_in2, ps1_in3;
@@ -521,20 +520,20 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 	WARN_ON(rc);
 
 	/* PS0 */
-	kvm_cvt_df(&fpr[reg_in1], &ps0_in1);
-	kvm_cvt_df(&fpr[reg_in2], &ps0_in2);
-	kvm_cvt_df(&fpr[reg_in3], &ps0_in3);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in3), &ps0_in3);
 
 	if (scalar & SCALAR_LOW)
 		ps0_in2 = qpr[reg_in2];
 
-	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
 
 	dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
 			  ps0_in1, ps0_in2, ps0_in3, ps0_out);
 
 	if (!(scalar & SCALAR_NO_PS0))
-		kvm_cvt_fd(&ps0_out, &fpr[reg_out]);
+		kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
 
 	/* PS1 */
 	ps1_in1 = qpr[reg_in1];
@@ -545,7 +544,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 		ps1_in2 = ps0_in2;
 
 	if (!(scalar & SCALAR_NO_PS1))
-		func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
+		func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
 
 	dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
 			  ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
@@ -561,7 +560,6 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 						 u32 *src2))
 {
 	u32 *qpr = vcpu->arch.qpr;
-	u64 *fpr = vcpu->arch.fpr;
 	u32 ps0_out;
 	u32 ps0_in1, ps0_in2;
 	u32 ps1_out;
@@ -571,20 +569,20 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 	WARN_ON(rc);
 
 	/* PS0 */
-	kvm_cvt_df(&fpr[reg_in1], &ps0_in1);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1);
 
 	if (scalar & SCALAR_LOW)
 		ps0_in2 = qpr[reg_in2];
 	else
-		kvm_cvt_df(&fpr[reg_in2], &ps0_in2);
+		kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2);
 
-	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
 
 	if (!(scalar & SCALAR_NO_PS0)) {
 		dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
 				  ps0_in1, ps0_in2, ps0_out);
 
-		kvm_cvt_fd(&ps0_out, &fpr[reg_out]);
+		kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
 	}
 
 	/* PS1 */
@@ -594,7 +592,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 	if (scalar & SCALAR_HIGH)
 		ps1_in2 = ps0_in2;
 
-	func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
+	func(&vcpu->arch.fp.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
 
 	if (!(scalar & SCALAR_NO_PS1)) {
 		qpr[reg_out] = ps1_out;
@@ -612,7 +610,6 @@ static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
 						 u32 *dst, u32 *src1))
 {
 	u32 *qpr = vcpu->arch.qpr;
-	u64 *fpr = vcpu->arch.fpr;
 	u32 ps0_out, ps0_in;
 	u32 ps1_in;
 
@@ -620,17 +617,17 @@ static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
 	WARN_ON(rc);
 
 	/* PS0 */
-	kvm_cvt_df(&fpr[reg_in], &ps0_in);
-	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in);
+	kvm_cvt_df(&VCPU_FPR(vcpu, reg_in), &ps0_in);
+	func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in);
 
 	dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
 			  ps0_in, ps0_out);
 
-	kvm_cvt_fd(&ps0_out, &fpr[reg_out]);
+	kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out));
 
 	/* PS1 */
 	ps1_in = qpr[reg_in];
-	func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in);
+	func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in);
 
 	dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
 			  ps1_in, qpr[reg_out]);
@@ -649,10 +646,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	int ax_rc = inst_get_field(inst, 21, 25);
 	short full_d = inst_get_field(inst, 16, 31);
 
-	u64 *fpr_d = &vcpu->arch.fpr[ax_rd];
-	u64 *fpr_a = &vcpu->arch.fpr[ax_ra];
-	u64 *fpr_b = &vcpu->arch.fpr[ax_rb];
-	u64 *fpr_c = &vcpu->arch.fpr[ax_rc];
+	u64 *fpr_d = &VCPU_FPR(vcpu, ax_rd);
+	u64 *fpr_a = &VCPU_FPR(vcpu, ax_ra);
+	u64 *fpr_b = &VCPU_FPR(vcpu, ax_rb);
+	u64 *fpr_c = &VCPU_FPR(vcpu, ax_rc);
 
 	bool rcomp = (inst & 1) ? true : false;
 	u32 cr = kvmppc_get_cr(vcpu);
@@ -674,11 +671,11 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	/* Do we need to clear FE0 / FE1 here? Don't think so. */
 
 #ifdef DEBUG
-	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) {
 		u32 f;
-		kvm_cvt_df(&vcpu->arch.fpr[i], &f);
+		kvm_cvt_df(&VCPU_FPR(vcpu, i), &f);
 		dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx    QPR[%d] = 0x%x\n",
-			i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]);
+			i, f, VCPU_FPR(vcpu, i), i, vcpu->arch.qpr[i]);
 	}
 #endif
 
@@ -764,8 +761,8 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 		}
 		case OP_4X_PS_NEG:
-			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb];
-			vcpu->arch.fpr[ax_rd] ^= 0x8000000000000000ULL;
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) ^= 0x8000000000000000ULL;
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			vcpu->arch.qpr[ax_rd] ^= 0x80000000;
 			break;
@@ -775,7 +772,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 		case OP_4X_PS_MR:
 			WARN_ON(rcomp);
-			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb];
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			break;
 		case OP_4X_PS_CMPO1:
@@ -784,44 +781,44 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 		case OP_4X_PS_NABS:
 			WARN_ON(rcomp);
-			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb];
-			vcpu->arch.fpr[ax_rd] |= 0x8000000000000000ULL;
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) |= 0x8000000000000000ULL;
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			vcpu->arch.qpr[ax_rd] |= 0x80000000;
 			break;
 		case OP_4X_PS_ABS:
 			WARN_ON(rcomp);
-			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rb];
-			vcpu->arch.fpr[ax_rd] &= ~0x8000000000000000ULL;
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb);
+			VCPU_FPR(vcpu, ax_rd) &= ~0x8000000000000000ULL;
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			vcpu->arch.qpr[ax_rd] &= ~0x80000000;
 			break;
 		case OP_4X_PS_MERGE00:
 			WARN_ON(rcomp);
-			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
-			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
-			kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra);
+			/* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */
+			kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb),
 				   &vcpu->arch.qpr[ax_rd]);
 			break;
 		case OP_4X_PS_MERGE01:
 			WARN_ON(rcomp);
-			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra);
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			break;
 		case OP_4X_PS_MERGE10:
 			WARN_ON(rcomp);
-			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
+			/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */
 			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
-				   &vcpu->arch.fpr[ax_rd]);
-			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
-			kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
+				   &VCPU_FPR(vcpu, ax_rd));
+			/* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */
+			kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb),
 				   &vcpu->arch.qpr[ax_rd]);
 			break;
 		case OP_4X_PS_MERGE11:
 			WARN_ON(rcomp);
-			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
+			/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */
 			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
-				   &vcpu->arch.fpr[ax_rd]);
+				   &VCPU_FPR(vcpu, ax_rd));
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			break;
 		}
@@ -856,7 +853,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		case OP_4A_PS_SUM1:
 			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
 					ax_rb, ax_ra, SCALAR_NO_PS0 | SCALAR_HIGH, fps_fadds);
-			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_rc];
+			VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rc);
 			break;
 		case OP_4A_PS_SUM0:
 			emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd,
@@ -1106,45 +1103,45 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	case 59:
 		switch (inst_get_field(inst, 21, 30)) {
 		case OP_59_FADDS:
-			fpd_fadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			fpd_fadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_59_FSUBS:
-			fpd_fsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			fpd_fsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_59_FDIVS:
-			fpd_fdivs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			fpd_fdivs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_59_FRES:
-			fpd_fres(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+			fpd_fres(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_59_FRSQRTES:
-			fpd_frsqrtes(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+			fpd_frsqrtes(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		}
 		switch (inst_get_field(inst, 26, 30)) {
 		case OP_59_FMULS:
-			fpd_fmuls(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c);
+			fpd_fmuls(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_59_FMSUBS:
-			fpd_fmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_59_FMADDS:
-			fpd_fmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_59_FNMSUBS:
-			fpd_fnmsubs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fnmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_59_FNMADDS:
-			fpd_fnmadds(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fnmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		}
@@ -1159,12 +1156,12 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 		case OP_63_MFFS:
 			/* XXX missing CR */
-			*fpr_d = vcpu->arch.fpscr;
+			*fpr_d = vcpu->arch.fp.fpscr;
 			break;
 		case OP_63_MTFSF:
 			/* XXX missing fm bits */
 			/* XXX missing CR */
-			vcpu->arch.fpscr = *fpr_b;
+			vcpu->arch.fp.fpscr = *fpr_b;
 			break;
 		case OP_63_FCMPU:
 		{
@@ -1172,7 +1169,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			u32 cr0_mask = 0xf0000000;
 			u32 cr_shift = inst_get_field(inst, 6, 8) * 4;
 
-			fpd_fcmpu(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b);
+			fpd_fcmpu(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b);
 			cr &= ~(cr0_mask >> cr_shift);
 			cr |= (cr & cr0_mask) >> cr_shift;
 			break;
@@ -1183,40 +1180,40 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			u32 cr0_mask = 0xf0000000;
 			u32 cr_shift = inst_get_field(inst, 6, 8) * 4;
 
-			fpd_fcmpo(&vcpu->arch.fpscr, &tmp_cr, fpr_a, fpr_b);
+			fpd_fcmpo(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b);
 			cr &= ~(cr0_mask >> cr_shift);
 			cr |= (cr & cr0_mask) >> cr_shift;
 			break;
 		}
 		case OP_63_FNEG:
-			fpd_fneg(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+			fpd_fneg(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
 			break;
 		case OP_63_FMR:
 			*fpr_d = *fpr_b;
 			break;
 		case OP_63_FABS:
-			fpd_fabs(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+			fpd_fabs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
 			break;
 		case OP_63_FCPSGN:
-			fpd_fcpsgn(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			fpd_fcpsgn(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
 			break;
 		case OP_63_FDIV:
-			fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
 			break;
 		case OP_63_FADD:
-			fpd_fadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			fpd_fadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
 			break;
 		case OP_63_FSUB:
-			fpd_fsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_b);
+			fpd_fsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b);
 			break;
 		case OP_63_FCTIW:
-			fpd_fctiw(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+			fpd_fctiw(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
 			break;
 		case OP_63_FCTIWZ:
-			fpd_fctiwz(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+			fpd_fctiwz(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
 			break;
 		case OP_63_FRSP:
-			fpd_frsp(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+			fpd_frsp(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
 			kvmppc_sync_qpr(vcpu, ax_rd);
 			break;
 		case OP_63_FRSQRTE:
@@ -1224,39 +1221,39 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			double one = 1.0f;
 
 			/* fD = sqrt(fB) */
-			fpd_fsqrt(&vcpu->arch.fpscr, &cr, fpr_d, fpr_b);
+			fpd_fsqrt(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b);
 			/* fD = 1.0f / fD */
-			fpd_fdiv(&vcpu->arch.fpscr, &cr, fpr_d, (u64*)&one, fpr_d);
+			fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, (u64*)&one, fpr_d);
 			break;
 		}
 		}
 		switch (inst_get_field(inst, 26, 30)) {
 		case OP_63_FMUL:
-			fpd_fmul(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c);
+			fpd_fmul(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c);
 			break;
 		case OP_63_FSEL:
-			fpd_fsel(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fsel(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			break;
 		case OP_63_FMSUB:
-			fpd_fmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			break;
 		case OP_63_FMADD:
-			fpd_fmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			break;
 		case OP_63_FNMSUB:
-			fpd_fnmsub(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fnmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			break;
 		case OP_63_FNMADD:
-			fpd_fnmadd(&vcpu->arch.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
+			fpd_fnmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b);
 			break;
 		}
 		break;
 	}
 
 #ifdef DEBUG
-	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
+	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) {
 		u32 f;
-		kvm_cvt_df(&vcpu->arch.fpr[i], &f);
+		kvm_cvt_df(&VCPU_FPR(vcpu, i), &f);
 		dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
 	}
 #endif
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index d63a91f825d3..2bb425b22461 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -545,12 +545,6 @@ static inline int get_fpr_index(int i)
 void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
 {
 	struct thread_struct *t = &current->thread;
-	u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-	u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-	u64 *thread_fpr = &t->fp_state.fpr[0][0];
-	int i;
 
 	/*
 	 * VSX instructions can access FP and vector registers, so if
@@ -575,24 +569,14 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
 		 */
 		if (current->thread.regs->msr & MSR_FP)
 			giveup_fpu(current);
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-			vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
-
-		vcpu->arch.fpscr = t->fp_state.fpscr;
-
-#ifdef CONFIG_VSX
-		if (cpu_has_feature(CPU_FTR_VSX))
-			for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
-				vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
-#endif
+		vcpu->arch.fp = t->fp_state;
 	}
 
 #ifdef CONFIG_ALTIVEC
 	if (msr & MSR_VEC) {
 		if (current->thread.regs->msr & MSR_VEC)
 			giveup_altivec(current);
-		memcpy(vcpu->arch.vr, t->vr_state.vr, sizeof(vcpu->arch.vr));
-		vcpu->arch.vscr = t->vr_state.vscr;
+		vcpu->arch.vr = t->vr_state;
 	}
 #endif
 
@@ -640,12 +624,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 			     ulong msr)
 {
 	struct thread_struct *t = &current->thread;
-	u64 *vcpu_fpr = vcpu->arch.fpr;
-#ifdef CONFIG_VSX
-	u64 *vcpu_vsx = vcpu->arch.vsr;
-#endif
-	u64 *thread_fpr = &t->fp_state.fpr[0][0];
-	int i;
 
 	/* When we have paired singles, we emulate in software */
 	if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)
@@ -683,13 +661,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 #endif
 
 	if (msr & MSR_FP) {
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
-			thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
-#ifdef CONFIG_VSX
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
-			thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
-#endif
-		t->fp_state.fpscr = vcpu->arch.fpscr;
+		t->fp_state = vcpu->arch.fp;
 		t->fpexc_mode = 0;
 		enable_kernel_fp();
 		load_fp_state(&t->fp_state);
@@ -697,8 +669,7 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
 
 	if (msr & MSR_VEC) {
 #ifdef CONFIG_ALTIVEC
-		memcpy(t->vr_state.vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
-		t->vr_state.vscr = vcpu->arch.vscr;
+		t->vr_state = vcpu->arch.vr;
 		t->vrsave = -1;
 		enable_kernel_altivec();
 		load_vr_state(&t->vr_state);
@@ -1118,19 +1089,6 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_HIOR:
 		*val = get_reg_val(id, to_book3s(vcpu)->hior);
 		break;
-#ifdef CONFIG_VSX
-	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
-		long int i = id - KVM_REG_PPC_VSR0;
-
-		if (!cpu_has_feature(CPU_FTR_VSX)) {
-			r = -ENXIO;
-			break;
-		}
-		val->vsxval[0] = vcpu->arch.fpr[i];
-		val->vsxval[1] = vcpu->arch.vsr[i];
-		break;
-	}
-#endif /* CONFIG_VSX */
 	default:
 		r = -EINVAL;
 		break;
@@ -1149,19 +1107,6 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 		to_book3s(vcpu)->hior = set_reg_val(id, *val);
 		to_book3s(vcpu)->hior_explicit = true;
 		break;
-#ifdef CONFIG_VSX
-	case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
-		long int i = id - KVM_REG_PPC_VSR0;
-
-		if (!cpu_has_feature(CPU_FTR_VSX)) {
-			r = -ENXIO;
-			break;
-		}
-		vcpu->arch.fpr[i] = val->vsxval[0];
-		vcpu->arch.vsr[i] = val->vsxval[1];
-		break;
-	}
-#endif /* CONFIG_VSX */
 	default:
 		r = -EINVAL;
 		break;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 53e65a210b9a..0033465ecc3f 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -707,9 +707,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	fpexc_mode = current->thread.fpexc_mode;
 
 	/* Restore guest FPU state to thread */
-	memcpy(current->thread.fp_state.fpr, vcpu->arch.fpr,
-	       sizeof(vcpu->arch.fpr));
-	current->thread.fp_state.fpscr = vcpu->arch.fpscr;
+	current->thread.fp_state = vcpu->arch.fp;
 
 	/*
 	 * Since we can't trap on MSR_FP in GS-mode, we consider the guest
@@ -745,9 +743,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	vcpu->fpu_active = 0;
 
 	/* Save guest FPU state from thread */
-	memcpy(vcpu->arch.fpr, current->thread.fp_state.fpr,
-	       sizeof(vcpu->arch.fpr));
-	vcpu->arch.fpscr = current->thread.fp_state.fpscr;
+	vcpu->arch.fp = current->thread.fp_state;
 
 	/* Restore userspace FPU state from stack */
 	current->thread.fp_state = fp;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9ae97686e9f4..7ca9e0a80499 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -656,14 +656,14 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 		kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr);
 		break;
 	case KVM_MMIO_REG_FPR:
-		vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
+		VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr;
 		break;
 #ifdef CONFIG_PPC_BOOK3S
 	case KVM_MMIO_REG_QPR:
 		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
 		break;
 	case KVM_MMIO_REG_FQPR:
-		vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
+		VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr;
 		vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr;
 		break;
 #endif
-- 
cgit v1.2.3


From f5e3fe091f5238459752a81b478398b7cb22e575 Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Fri, 15 Nov 2013 11:01:15 +0530
Subject: kvm: powerpc: define a linux pte lookup function

We need to search linux "pte" to get "pte" attributes for setting TLB in KVM.
This patch defines a lookup_linux_ptep() function which returns pte pointer.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Reviewed-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/pgtable.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 7d6eacf249cf..b60ceb8d86c8 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -223,6 +223,27 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 #endif
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 				 unsigned *shift);
+
+static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
+				     unsigned long *pte_sizep)
+{
+	pte_t *ptep;
+	unsigned long ps = *pte_sizep;
+	unsigned int shift;
+
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+	if (!ptep)
+		return NULL;
+	if (shift)
+		*pte_sizep = 1ul << shift;
+	else
+		*pte_sizep = PAGE_SIZE;
+
+	if (ps > *pte_sizep)
+		return NULL;
+
+	return ptep;
+}
 #endif /* __ASSEMBLY__ */
 
 #endif /* __KERNEL__ */
-- 
cgit v1.2.3


From 08c9a188d0d0fc0f0c5e17d89a06bb59c493110f Mon Sep 17 00:00:00 2001
From: Bharat Bhushan <r65777@freescale.com>
Date: Mon, 18 Nov 2013 11:18:54 +0530
Subject: kvm: powerpc: use caching attributes as per linux pte

KVM uses same WIM tlb attributes as the corresponding qemu pte.
For this we now search the linux pte for the requested page and
get these cache caching/coherency attributes from pte.

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
Reviewed-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h |  2 +-
 arch/powerpc/kvm/booke.c            |  1 +
 arch/powerpc/kvm/e500.h             |  8 ++++---
 arch/powerpc/kvm/e500_mmu_host.c    | 43 ++++++++++++++++++++++---------------
 4 files changed, 33 insertions(+), 21 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 3ca0b430eaee..2c2ca5faf7f2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -540,6 +540,7 @@ struct kvm_vcpu_arch {
 #endif
 	gpa_t paddr_accessed;
 	gva_t vaddr_accessed;
+	pgd_t *pgdir;
 
 	u8 io_gpr; /* GPR used as IO source/target */
 	u8 mmio_is_bigendian;
@@ -597,7 +598,6 @@ struct kvm_vcpu_arch {
 	struct list_head run_list;
 	struct task_struct *run_task;
 	struct kvm_run *kvm_run;
-	pgd_t *pgdir;
 
 	spinlock_t vpa_update_lock;
 	struct kvmppc_vpa vpa;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a983ccaf3cce..54ee1c01798f 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -717,6 +717,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	thread.debug = current->thread.debug;
 	current->thread.debug = vcpu->arch.shadow_dbg_reg;
 
+	vcpu->arch.pgdir = current->mm->pgd;
 	kvmppc_fix_ee_before_entry();
 
 	ret = __kvmppc_vcpu_run(kvm_run, vcpu);
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 4fd9650eb018..a326178bdea5 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -31,11 +31,13 @@ enum vcpu_ftr {
 #define E500_TLB_NUM   2
 
 /* entry is mapped somewhere in host TLB */
-#define E500_TLB_VALID		(1 << 0)
+#define E500_TLB_VALID		(1 << 31)
 /* TLB1 entry is mapped by host TLB1, tracked by bitmaps */
-#define E500_TLB_BITMAP		(1 << 1)
+#define E500_TLB_BITMAP		(1 << 30)
 /* TLB1 entry is mapped by host TLB0 */
-#define E500_TLB_TLB0		(1 << 2)
+#define E500_TLB_TLB0		(1 << 29)
+/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */
+#define E500_TLB_MAS2_ATTR	(0x7f)
 
 struct tlbe_ref {
 	pfn_t pfn;		/* valid only for TLB0, except briefly */
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index a45e25cd78fc..dd2cc03f406f 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -65,15 +65,6 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
 	return mas3;
 }
 
-static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
-{
-#ifdef CONFIG_SMP
-	return (mas2 & MAS2_ATTRIB_MASK) | MAS2_M;
-#else
-	return mas2 & MAS2_ATTRIB_MASK;
-#endif
-}
-
 /*
  * writing shadow tlb entry to host TLB
  */
@@ -249,11 +240,14 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe)
 
 static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
 					 struct kvm_book3e_206_tlb_entry *gtlbe,
-					 pfn_t pfn)
+					 pfn_t pfn, unsigned int wimg)
 {
 	ref->pfn = pfn;
 	ref->flags = E500_TLB_VALID;
 
+	/* Use guest supplied MAS2_G and MAS2_E */
+	ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg;
+
 	/* Mark the page accessed */
 	kvm_set_pfn_accessed(pfn);
 
@@ -316,8 +310,7 @@ static void kvmppc_e500_setup_stlbe(
 
 	/* Force IPROT=0 for all guest mappings. */
 	stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
-	stlbe->mas2 = (gvaddr & MAS2_EPN) |
-		      e500_shadow_mas2_attrib(gtlbe->mas2, pr);
+	stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR);
 	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
 			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
 
@@ -339,6 +332,10 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	int ret = 0;
 	unsigned long mmu_seq;
 	struct kvm *kvm = vcpu_e500->vcpu.kvm;
+	unsigned long tsize_pages = 0;
+	pte_t *ptep;
+	unsigned int wimg = 0;
+	pgd_t *pgdir;
 
 	/* used to check for invalidations in progress */
 	mmu_seq = kvm->mmu_notifier_seq;
@@ -405,7 +402,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 			 */
 
 			for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
-				unsigned long gfn_start, gfn_end, tsize_pages;
+				unsigned long gfn_start, gfn_end;
 				tsize_pages = 1 << (tsize - 2);
 
 				gfn_start = gfn & ~(tsize_pages - 1);
@@ -447,11 +444,12 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	}
 
 	if (likely(!pfnmap)) {
-		unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
+		tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
 		pfn = gfn_to_pfn_memslot(slot, gfn);
 		if (is_error_noslot_pfn(pfn)) {
-			printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
-					(long)gfn);
+			if (printk_ratelimit())
+				pr_err("%s: real page not found for gfn %lx\n",
+				       __func__, (long)gfn);
 			return -EINVAL;
 		}
 
@@ -466,7 +464,18 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 		goto out;
 	}
 
-	kvmppc_e500_ref_setup(ref, gtlbe, pfn);
+
+	pgdir = vcpu_e500->vcpu.arch.pgdir;
+	ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
+	if (pte_present(*ptep))
+		wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
+	else {
+		if (printk_ratelimit())
+			pr_err("%s: pte not present: gfn %lx, pfn %lx\n",
+				__func__, (long)gfn, pfn);
+		return -EINVAL;
+	}
+	kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
 
 	kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
 				ref, gvaddr, stlbe);
-- 
cgit v1.2.3


From 7a8ff56be68239bd36a2b639cb40bfbcfc58dad3 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 9 Jan 2014 11:10:44 +0100
Subject: KVM: PPC: Unify kvmppc_get_last_inst and sc

We had code duplication between the inline functions to get our last
instruction on normal interrupts and system call interrupts. Unify
both helper functions towards a single implementation.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s.h | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 8bb870694616..f8b23201c105 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -264,10 +264,8 @@ static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
 	return vcpu->arch.pc;
 }
 
-static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc)
 {
-	ulong pc = kvmppc_get_pc(vcpu);
-
 	/* Load the instruction manually if it failed to do so in the
 	 * exit path */
 	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
@@ -276,6 +274,11 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
 	return vcpu->arch.last_inst;
 }
 
+static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
+{
+	return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu));
+}
+
 /*
  * Like kvmppc_get_last_inst(), but for fetching a sc instruction.
  * Because the sc instruction sets SRR0 to point to the following
@@ -283,14 +286,7 @@ static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
  */
 static inline u32 kvmppc_get_last_sc(struct kvm_vcpu *vcpu)
 {
-	ulong pc = kvmppc_get_pc(vcpu) - 4;
-
-	/* Load the instruction manually if it failed to do so in the
-	 * exit path */
-	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
-		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
-
-	return vcpu->arch.last_inst;
+	return kvmppc_get_last_inst_internal(vcpu, kvmppc_get_pc(vcpu) - 4);
 }
 
 static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 1c49abec677c7ff495837dbaafad8e12f09c29fc Mon Sep 17 00:00:00 2001
From: Kevin Hao <haokexin@gmail.com>
Date: Tue, 24 Dec 2013 15:12:05 +0800
Subject: powerpc: introduce macro LOAD_REG_ADDR_PIC

This is used to get the address of a variable when the kernel is not
running at the linked or relocated address.

Signed-off-by: Kevin Hao <haokexin@gmail.com>
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/ppc_asm.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index f595b98079ee..1279c59624ed 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -295,6 +295,11 @@ n:
  *   you want to access various offsets within it).  On ppc32 this is
  *   identical to LOAD_REG_IMMEDIATE.
  *
+ * LOAD_REG_ADDR_PIC(rn, name)
+ *   Loads the address of label 'name' into register 'run'. Use this when
+ *   the kernel doesn't run at the linked or relocated address. Please
+ *   note that this macro will clobber the lr register.
+ *
  * LOAD_REG_ADDRBASE(rn, name)
  * ADDROFF(name)
  *   LOAD_REG_ADDRBASE loads part of the address of label 'name' into
@@ -305,6 +310,14 @@ n:
  *      LOAD_REG_ADDRBASE(rX, name)
  *      ld	rY,ADDROFF(name)(rX)
  */
+
+/* Be careful, this will clobber the lr register. */
+#define LOAD_REG_ADDR_PIC(reg, name)		\
+	bl	0f;				\
+0:	mflr	reg;				\
+	addis	reg,reg,(name - 0b)@ha;		\
+	addi	reg,reg,(name - 0b)@l;
+
 #ifdef __powerpc64__
 #define LOAD_REG_IMMEDIATE(reg,expr)		\
 	lis     reg,(expr)@highest;		\
-- 
cgit v1.2.3


From 28efc35fe68dacbddc4b12c2fa8f2df1593a4ad3 Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Fri, 11 Oct 2013 19:22:38 -0500
Subject: powerpc/e6500: TLB miss handler with hardware tablewalk support

There are a few things that make the existing hw tablewalk handlers
unsuitable for e6500:

 - Indirect entries go in TLB1 (though the resulting direct entries go in
   TLB0).

 - It has threads, but no "tlbsrx." -- so we need a spinlock and
   a normal "tlbsx".  Because we need this lock, hardware tablewalk
   is mandatory on e6500 unless we want to add spinlock+tlbsx to
   the normal bolted TLB miss handler.

 - TLB1 has no HES (nor next-victim hint) so we need software round robin
   (TODO: integrate this round robin data with hugetlb/KVM)

 - The existing tablewalk handlers map half of a page table at a time,
   because IBM hardware has a fixed 1MiB indirect page size.  e6500
   has variable size indirect entries, with a minimum of 2MiB.
   So we can't do the half-page indirect mapping, and even if we
   could it would be less efficient than mapping the full page.

 - Like on e5500, the linear mapping is bolted, so we don't need the
   overhead of supporting nested tlb misses.

Note that hardware tablewalk does not work in rev1 of e6500.
We do not expect to support e6500 rev1 in mainline Linux.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Cc: Mihai Caraman <mihai.caraman@freescale.com>
---
 arch/powerpc/include/asm/mmu-book3e.h |  13 +++
 arch/powerpc/include/asm/mmu.h        |  21 +++--
 arch/powerpc/include/asm/paca.h       |   6 ++
 arch/powerpc/kernel/asm-offsets.c     |   9 ++
 arch/powerpc/kernel/paca.c            |   5 +
 arch/powerpc/kernel/setup_64.c        |  31 ++++++
 arch/powerpc/mm/fsl_booke_mmu.c       |   7 ++
 arch/powerpc/mm/mem.c                 |   6 ++
 arch/powerpc/mm/tlb_low_64e.S         | 171 ++++++++++++++++++++++++++++++++++
 arch/powerpc/mm/tlb_nohash.c          |  93 ++++++++++++------
 10 files changed, 326 insertions(+), 36 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index 936db360790a..89b785d16846 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -286,8 +286,21 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 extern int mmu_linear_psize;
 extern int mmu_vmemmap_psize;
 
+struct tlb_core_data {
+	/* For software way selection, as on Freescale TLB1 */
+	u8 esel_next, esel_max, esel_first;
+
+	/* Per-core spinlock for e6500 TLB handlers (no tlbsrx.) */
+	u8 lock;
+};
+
 #ifdef CONFIG_PPC64
 extern unsigned long linear_map_top;
+extern int book3e_htw_mode;
+
+#define PPC_HTW_NONE	0
+#define PPC_HTW_IBM	1
+#define PPC_HTW_E6500	2
 
 /*
  * 64-bit booke platforms don't load the tlb in the tlb miss handler code.
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 691fd8aca939..f8d1d6dcf7db 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -180,16 +180,17 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 #define MMU_PAGE_64K_AP	3	/* "Admixed pages" (hash64 only) */
 #define MMU_PAGE_256K	4
 #define MMU_PAGE_1M	5
-#define MMU_PAGE_4M	6
-#define MMU_PAGE_8M	7
-#define MMU_PAGE_16M	8
-#define MMU_PAGE_64M	9
-#define MMU_PAGE_256M	10
-#define MMU_PAGE_1G	11
-#define MMU_PAGE_16G	12
-#define MMU_PAGE_64G	13
-
-#define MMU_PAGE_COUNT	14
+#define MMU_PAGE_2M	6
+#define MMU_PAGE_4M	7
+#define MMU_PAGE_8M	8
+#define MMU_PAGE_16M	9
+#define MMU_PAGE_64M	10
+#define MMU_PAGE_256M	11
+#define MMU_PAGE_1G	12
+#define MMU_PAGE_16G	13
+#define MMU_PAGE_64G	14
+
+#define MMU_PAGE_COUNT	15
 
 #if defined(CONFIG_PPC_STD_MMU_64)
 /* 64-bit classic hash table MMU */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index c3523d1dda58..e81731c62a7f 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -113,6 +113,10 @@ struct paca_struct {
 	/* Keep pgd in the same cacheline as the start of extlb */
 	pgd_t *pgd __attribute__((aligned(0x80))); /* Current PGD */
 	pgd_t *kernel_pgd;		/* Kernel PGD */
+
+	/* Shared by all threads of a core -- points to tcd of first thread */
+	struct tlb_core_data *tcd_ptr;
+
 	/* We can have up to 3 levels of reentrancy in the TLB miss handler */
 	u64 extlb[3][EX_TLB_SIZE / sizeof(u64)];
 	u64 exmc[8];		/* used for machine checks */
@@ -123,6 +127,8 @@ struct paca_struct {
 	void *mc_kstack;
 	void *crit_kstack;
 	void *dbg_kstack;
+
+	struct tlb_core_data tcd;
 #endif /* CONFIG_PPC_BOOK3E */
 
 	mm_context_t context;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 41a283956a29..ed8d68ce71f3 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -203,6 +203,15 @@ int main(void)
 	DEFINE(PACA_MC_STACK, offsetof(struct paca_struct, mc_kstack));
 	DEFINE(PACA_CRIT_STACK, offsetof(struct paca_struct, crit_kstack));
 	DEFINE(PACA_DBG_STACK, offsetof(struct paca_struct, dbg_kstack));
+	DEFINE(PACA_TCD_PTR, offsetof(struct paca_struct, tcd_ptr));
+
+	DEFINE(TCD_ESEL_NEXT,
+		offsetof(struct tlb_core_data, esel_next));
+	DEFINE(TCD_ESEL_MAX,
+		offsetof(struct tlb_core_data, esel_max));
+	DEFINE(TCD_ESEL_FIRST,
+		offsetof(struct tlb_core_data, esel_first));
+	DEFINE(TCD_LOCK, offsetof(struct tlb_core_data, lock));
 #endif /* CONFIG_PPC_BOOK3E */
 
 #ifdef CONFIG_PPC_STD_MMU_64
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 623c356fe34f..bf0aada02fe4 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -160,6 +160,11 @@ void __init initialise_paca(struct paca_struct *new_paca, int cpu)
 #ifdef CONFIG_PPC_STD_MMU_64
 	new_paca->slb_shadow_ptr = init_slb_shadow(cpu);
 #endif /* CONFIG_PPC_STD_MMU_64 */
+
+#ifdef CONFIG_PPC_BOOK3E
+	/* For now -- if we have threads this will be adjusted later */
+	new_paca->tcd_ptr = &new_paca->tcd;
+#endif
 }
 
 /* Put the paca pointer into r13 and SPRG_PACA */
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 2232aff66059..1ce9b87d7df8 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -97,6 +97,36 @@ int dcache_bsize;
 int icache_bsize;
 int ucache_bsize;
 
+#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
+static void setup_tlb_core_data(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int first = cpu_first_thread_sibling(cpu);
+
+		paca[cpu].tcd_ptr = &paca[first].tcd;
+
+		/*
+		 * If we have threads, we need either tlbsrx.
+		 * or e6500 tablewalk mode, or else TLB handlers
+		 * will be racy and could produce duplicate entries.
+		 */
+		if (smt_enabled_at_boot >= 2 &&
+		    !mmu_has_feature(MMU_FTR_USE_TLBRSRV) &&
+		    book3e_htw_mode != PPC_HTW_E6500) {
+			/* Should we panic instead? */
+			WARN_ONCE("%s: unsupported MMU configuration -- expect problems\n",
+				  __func__);
+		}
+	}
+}
+#else
+static void setup_tlb_core_data(void)
+{
+}
+#endif
+
 #ifdef CONFIG_SMP
 
 static char *smt_enabled_cmdline;
@@ -445,6 +475,7 @@ void __init setup_system(void)
 
 	smp_setup_cpu_maps();
 	check_smt_enabled();
+	setup_tlb_core_data();
 
 #ifdef CONFIG_SMP
 	/* Release secondary cpus out of their spinloops at 0x60 now that
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index a68671c18ad4..94cd728166d3 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -52,6 +52,7 @@
 #include <asm/smp.h>
 #include <asm/machdep.h>
 #include <asm/setup.h>
+#include <asm/paca.h>
 
 #include "mmu_decl.h"
 
@@ -191,6 +192,12 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
 	}
 	tlbcam_index = i;
 
+#ifdef CONFIG_PPC64
+	get_paca()->tcd.esel_next = i;
+	get_paca()->tcd.esel_max = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY;
+	get_paca()->tcd.esel_first = i;
+#endif
+
 	return amount_mapped;
 }
 
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 3fa93dc7fe75..94448cd4b444 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -307,6 +307,12 @@ static void __init register_page_bootmem_info(void)
 
 void __init mem_init(void)
 {
+	/*
+	 * book3s is limited to 16 page sizes due to encoding this in
+	 * a 4-bit field for slices.
+	 */
+	BUILD_BUG_ON(MMU_PAGE_COUNT > 16);
+
 #ifdef CONFIG_SWIOTLB
 	swiotlb_init(0);
 #endif
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index b4113bf86353..75f5d2777f61 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -239,6 +239,177 @@ itlb_miss_fault_bolted:
 	beq	tlb_miss_common_bolted
 	b	itlb_miss_kernel_bolted
 
+/*
+ * TLB miss handling for e6500 and derivatives, using hardware tablewalk.
+ *
+ * Linear mapping is bolted: no virtual page table or nested TLB misses
+ * Indirect entries in TLB1, hardware loads resulting direct entries
+ *    into TLB0
+ * No HES or NV hint on TLB1, so we need to do software round-robin
+ * No tlbsrx. so we need a spinlock, and we have to deal
+ *    with MAS-damage caused by tlbsx
+ * 4K pages only
+ */
+
+	START_EXCEPTION(instruction_tlb_miss_e6500)
+	tlb_prolog_bolted BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR0
+
+	ld	r11,PACA_TCD_PTR(r13)
+	srdi.	r15,r16,60		/* get region */
+	ori	r16,r16,1
+
+	TLB_MISS_STATS_SAVE_INFO_BOLTED
+	bne	tlb_miss_kernel_e6500	/* user/kernel test */
+
+	b	tlb_miss_common_e6500
+
+	START_EXCEPTION(data_tlb_miss_e6500)
+	tlb_prolog_bolted BOOKE_INTERRUPT_DTLB_MISS SPRN_DEAR
+
+	ld	r11,PACA_TCD_PTR(r13)
+	srdi.	r15,r16,60		/* get region */
+	rldicr	r16,r16,0,62
+
+	TLB_MISS_STATS_SAVE_INFO_BOLTED
+	bne	tlb_miss_kernel_e6500	/* user vs kernel check */
+
+/*
+ * This is the guts of the TLB miss handler for e6500 and derivatives.
+ * We are entered with:
+ *
+ * r16 = page of faulting address (low bit 0 if data, 1 if instruction)
+ * r15 = crap (free to use)
+ * r14 = page table base
+ * r13 = PACA
+ * r11 = tlb_per_core ptr
+ * r10 = crap (free to use)
+ */
+tlb_miss_common_e6500:
+	/*
+	 * Search if we already have an indirect entry for that virtual
+	 * address, and if we do, bail out.
+	 *
+	 * MAS6:IND should be already set based on MAS4
+	 */
+	addi	r10,r11,TCD_LOCK
+1:	lbarx	r15,0,r10
+	cmpdi	r15,0
+	bne	2f
+	li	r15,1
+	stbcx.	r15,0,r10
+	bne	1b
+	.subsection 1
+2:	lbz	r15,0(r10)
+	cmpdi	r15,0
+	bne	2b
+	b	1b
+	.previous
+
+	mfspr	r15,SPRN_MAS2
+
+	tlbsx	0,r16
+	mfspr	r10,SPRN_MAS1
+	andis.	r10,r10,MAS1_VALID@h
+	bne	tlb_miss_done_e6500
+
+	/* Undo MAS-damage from the tlbsx */
+	mfspr	r10,SPRN_MAS1
+	oris	r10,r10,MAS1_VALID@h
+	mtspr	SPRN_MAS1,r10
+	mtspr	SPRN_MAS2,r15
+
+	/* Now, we need to walk the page tables. First check if we are in
+	 * range.
+	 */
+	rldicl.	r10,r16,64-PGTABLE_EADDR_SIZE,PGTABLE_EADDR_SIZE+4
+	bne-	tlb_miss_fault_e6500
+
+	rldicl	r15,r16,64-PGDIR_SHIFT+3,64-PGD_INDEX_SIZE-3
+	cmpldi	cr0,r14,0
+	clrrdi	r15,r15,3
+	beq-	tlb_miss_fault_e6500 /* No PGDIR, bail */
+	ldx	r14,r14,r15		/* grab pgd entry */
+
+	rldicl	r15,r16,64-PUD_SHIFT+3,64-PUD_INDEX_SIZE-3
+	clrrdi	r15,r15,3
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_e6500	/* Bad pgd entry or hugepage; bail */
+	ldx	r14,r14,r15		/* grab pud entry */
+
+	rldicl	r15,r16,64-PMD_SHIFT+3,64-PMD_INDEX_SIZE-3
+	clrrdi	r15,r15,3
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_e6500
+	ldx	r14,r14,r15		/* Grab pmd entry */
+
+	mfspr	r10,SPRN_MAS0
+	cmpdi	cr0,r14,0
+	bge	tlb_miss_fault_e6500
+
+	/* Now we build the MAS for a 2M indirect page:
+	 *
+	 * MAS 0   :	ESEL needs to be filled by software round-robin
+	 * MAS 1   :	Fully set up
+	 *               - PID already updated by caller if necessary
+	 *               - TSIZE for now is base ind page size always
+	 *               - TID already cleared if necessary
+	 * MAS 2   :	Default not 2M-aligned, need to be redone
+	 * MAS 3+7 :	Needs to be done
+	 */
+
+	ori	r14,r14,(BOOK3E_PAGESZ_4K << MAS3_SPSIZE_SHIFT)
+	mtspr	SPRN_MAS7_MAS3,r14
+
+	clrrdi	r15,r16,21		/* make EA 2M-aligned */
+	mtspr	SPRN_MAS2,r15
+
+	lbz	r15,TCD_ESEL_NEXT(r11)
+	lbz	r16,TCD_ESEL_MAX(r11)
+	lbz	r14,TCD_ESEL_FIRST(r11)
+	rlwimi	r10,r15,16,0x00ff0000	/* insert esel_next into MAS0 */
+	addi	r15,r15,1		/* increment esel_next */
+	mtspr	SPRN_MAS0,r10
+	cmpw	r15,r16
+	iseleq	r15,r14,r15		/* if next == last use first */
+	stb	r15,TCD_ESEL_NEXT(r11)
+
+	tlbwe
+
+tlb_miss_done_e6500:
+	.macro	tlb_unlock_e6500
+	li	r15,0
+	isync
+	stb	r15,TCD_LOCK(r11)
+	.endm
+
+	tlb_unlock_e6500
+	TLB_MISS_STATS_X(MMSTAT_TLB_MISS_NORM_OK)
+	tlb_epilog_bolted
+	rfi
+
+tlb_miss_kernel_e6500:
+	mfspr	r10,SPRN_MAS1
+	ld	r14,PACA_KERNELPGD(r13)
+	cmpldi	cr0,r15,8		/* Check for vmalloc region */
+	rlwinm	r10,r10,0,16,1		/* Clear TID */
+	mtspr	SPRN_MAS1,r10
+	beq+	tlb_miss_common_e6500
+
+tlb_miss_fault_e6500:
+	tlb_unlock_e6500
+	/* We need to check if it was an instruction miss */
+	andi.	r16,r16,1
+	bne	itlb_miss_fault_e6500
+dtlb_miss_fault_e6500:
+	TLB_MISS_STATS_D(MMSTAT_TLB_MISS_NORM_FAULT)
+	tlb_epilog_bolted
+	b	exc_data_storage_book3e
+itlb_miss_fault_e6500:
+	TLB_MISS_STATS_I(MMSTAT_TLB_MISS_NORM_FAULT)
+	tlb_epilog_bolted
+	b	exc_instruction_storage_book3e
+
+
 /**********************************************************************
  *                                                                    *
  * TLB miss handling for Book3E with TLB reservation and HES support  *
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 8805b7b87dc6..735839b74dc5 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -43,6 +43,7 @@
 #include <asm/tlb.h>
 #include <asm/code-patching.h>
 #include <asm/hugetlb.h>
+#include <asm/paca.h>
 
 #include "mmu_decl.h"
 
@@ -58,6 +59,10 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 		.shift	= 12,
 		.enc	= BOOK3E_PAGESZ_4K,
 	},
+	[MMU_PAGE_2M] = {
+		.shift	= 21,
+		.enc	= BOOK3E_PAGESZ_2M,
+	},
 	[MMU_PAGE_4M] = {
 		.shift	= 22,
 		.enc	= BOOK3E_PAGESZ_4M,
@@ -136,7 +141,7 @@ static inline int mmu_get_tsize(int psize)
 int mmu_linear_psize;		/* Page size used for the linear mapping */
 int mmu_pte_psize;		/* Page size used for PTE pages */
 int mmu_vmemmap_psize;		/* Page size used for the virtual mem map */
-int book3e_htw_enabled;		/* Is HW tablewalk enabled ? */
+int book3e_htw_mode;		/* HW tablewalk?  Value is PPC_HTW_* */
 unsigned long linear_map_top;	/* Top of linear mapping */
 
 #endif /* CONFIG_PPC64 */
@@ -377,7 +382,7 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 {
 	int tsize = mmu_psize_defs[mmu_pte_psize].enc;
 
-	if (book3e_htw_enabled) {
+	if (book3e_htw_mode != PPC_HTW_NONE) {
 		unsigned long start = address & PMD_MASK;
 		unsigned long end = address + PMD_SIZE;
 		unsigned long size = 1UL << mmu_psize_defs[mmu_pte_psize].shift;
@@ -430,7 +435,7 @@ static void setup_page_sizes(void)
 			def = &mmu_psize_defs[psize];
 			shift = def->shift;
 
-			if (shift == 0)
+			if (shift == 0 || shift & 1)
 				continue;
 
 			/* adjust to be in terms of 4^shift Kb */
@@ -440,21 +445,40 @@ static void setup_page_sizes(void)
 				def->flags |= MMU_PAGE_SIZE_DIRECT;
 		}
 
-		goto no_indirect;
+		goto out;
 	}
 
 	if (fsl_mmu && (mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2) {
-		u32 tlb1ps = mfspr(SPRN_TLB1PS);
+		u32 tlb1cfg, tlb1ps;
+
+		tlb0cfg = mfspr(SPRN_TLB0CFG);
+		tlb1cfg = mfspr(SPRN_TLB1CFG);
+		tlb1ps = mfspr(SPRN_TLB1PS);
+		eptcfg = mfspr(SPRN_EPTCFG);
+
+		if ((tlb1cfg & TLBnCFG_IND) && (tlb0cfg & TLBnCFG_PT))
+			book3e_htw_mode = PPC_HTW_E6500;
+
+		/*
+		 * We expect 4K subpage size and unrestricted indirect size.
+		 * The lack of a restriction on indirect size is a Freescale
+		 * extension, indicated by PSn = 0 but SPSn != 0.
+		 */
+		if (eptcfg != 2)
+			book3e_htw_mode = PPC_HTW_NONE;
 
 		for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
 			struct mmu_psize_def *def = &mmu_psize_defs[psize];
 
 			if (tlb1ps & (1U << (def->shift - 10))) {
 				def->flags |= MMU_PAGE_SIZE_DIRECT;
+
+				if (book3e_htw_mode && psize == MMU_PAGE_2M)
+					def->flags |= MMU_PAGE_SIZE_INDIRECT;
 			}
 		}
 
-		goto no_indirect;
+		goto out;
 	}
 #endif
 
@@ -471,8 +495,11 @@ static void setup_page_sizes(void)
 	}
 
 	/* Indirect page sizes supported ? */
-	if ((tlb0cfg & TLBnCFG_IND) == 0)
-		goto no_indirect;
+	if ((tlb0cfg & TLBnCFG_IND) == 0 ||
+	    (tlb0cfg & TLBnCFG_PT) == 0)
+		goto out;
+
+	book3e_htw_mode = PPC_HTW_IBM;
 
 	/* Now, we only deal with one IND page size for each
 	 * direct size. Hopefully all implementations today are
@@ -497,8 +524,8 @@ static void setup_page_sizes(void)
 				def->ind = ps + 10;
 		}
 	}
- no_indirect:
 
+out:
 	/* Cleanup array and print summary */
 	pr_info("MMU: Supported page sizes\n");
 	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
@@ -520,23 +547,23 @@ static void setup_page_sizes(void)
 
 static void setup_mmu_htw(void)
 {
-	/* Check if HW tablewalk is present, and if yes, enable it by:
-	 *
-	 * - patching the TLB miss handlers to branch to the
-	 *   one dedicates to it
-	 *
-	 * - setting the global book3e_htw_enabled
-       	 */
-	unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+	/*
+	 * If we want to use HW tablewalk, enable it by patching the TLB miss
+	 * handlers to branch to the one dedicated to it.
+	 */
 
-	if ((tlb0cfg & TLBnCFG_IND) &&
-	    (tlb0cfg & TLBnCFG_PT)) {
+	switch (book3e_htw_mode) {
+	case PPC_HTW_IBM:
 		patch_exception(0x1c0, exc_data_tlb_miss_htw_book3e);
 		patch_exception(0x1e0, exc_instruction_tlb_miss_htw_book3e);
-		book3e_htw_enabled = 1;
+		break;
+	case PPC_HTW_E6500:
+		patch_exception(0x1c0, exc_data_tlb_miss_e6500_book3e);
+		patch_exception(0x1e0, exc_instruction_tlb_miss_e6500_book3e);
+		break;
 	}
 	pr_info("MMU: Book3E HW tablewalk %s\n",
-		book3e_htw_enabled ? "enabled" : "not supported");
+		book3e_htw_mode != PPC_HTW_NONE ? "enabled" : "not supported");
 }
 
 /*
@@ -576,8 +603,16 @@ static void __early_init_mmu(int boot_cpu)
 	/* Set MAS4 based on page table setting */
 
 	mas4 = 0x4 << MAS4_WIMGED_SHIFT;
-	if (book3e_htw_enabled) {
-		mas4 |= mas4 | MAS4_INDD;
+	switch (book3e_htw_mode) {
+	case PPC_HTW_E6500:
+		mas4 |= MAS4_INDD;
+		mas4 |= BOOK3E_PAGESZ_2M << MAS4_TSIZED_SHIFT;
+		mas4 |= MAS4_TLBSELD(1);
+		mmu_pte_psize = MMU_PAGE_2M;
+		break;
+
+	case PPC_HTW_IBM:
+		mas4 |= MAS4_INDD;
 #ifdef CONFIG_PPC_64K_PAGES
 		mas4 |=	BOOK3E_PAGESZ_256M << MAS4_TSIZED_SHIFT;
 		mmu_pte_psize = MMU_PAGE_256M;
@@ -585,13 +620,16 @@ static void __early_init_mmu(int boot_cpu)
 		mas4 |=	BOOK3E_PAGESZ_1M << MAS4_TSIZED_SHIFT;
 		mmu_pte_psize = MMU_PAGE_1M;
 #endif
-	} else {
+		break;
+
+	case PPC_HTW_NONE:
 #ifdef CONFIG_PPC_64K_PAGES
 		mas4 |=	BOOK3E_PAGESZ_64K << MAS4_TSIZED_SHIFT;
 #else
 		mas4 |=	BOOK3E_PAGESZ_4K << MAS4_TSIZED_SHIFT;
 #endif
 		mmu_pte_psize = mmu_virtual_psize;
+		break;
 	}
 	mtspr(SPRN_MAS4, mas4);
 
@@ -611,8 +649,11 @@ static void __early_init_mmu(int boot_cpu)
 		/* limit memory so we dont have linear faults */
 		memblock_enforce_memory_limit(linear_map_top);
 
-		patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
-		patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e);
+		if (book3e_htw_mode == PPC_HTW_NONE) {
+			patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e);
+			patch_exception(0x1e0,
+				exc_instruction_tlb_miss_bolted_book3e);
+		}
 	}
 #endif
 
-- 
cgit v1.2.3


From a655f724df2c0e1634477c7e89da81477a691c0f Mon Sep 17 00:00:00 2001
From: Shaohui Xie <Shaohui.Xie@freescale.com>
Date: Tue, 7 Jan 2014 14:27:41 +0800
Subject: powerpc/85xx: handle the eLBC error interrupt if it exists in dts

On P1020, P1021, P1022, and P1023, eLBC event interrupts are routed
to internal interrupt 3 while ELBC error interrupts are routed to
internal interrupt 0.  We need to call request_irq for each.

Signed-off-by: Shaohui Xie <Shaohui.Xie@freescale.com>
Signed-off-by: Wang Dongsheng <dongsheng.wang@freescale.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
[scottwood@freescale.com: reworded commit message and fixed author]
Signed-off-by: Scott Wood <scottwood@freescale.com>
---
 arch/powerpc/include/asm/fsl_lbc.h |  2 +-
 arch/powerpc/sysdev/fsl_lbc.c      | 31 +++++++++++++++++++++++++------
 2 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/fsl_lbc.h b/arch/powerpc/include/asm/fsl_lbc.h
index 420b45368fcf..067fb0dca549 100644
--- a/arch/powerpc/include/asm/fsl_lbc.h
+++ b/arch/powerpc/include/asm/fsl_lbc.h
@@ -285,7 +285,7 @@ struct fsl_lbc_ctrl {
 	/* device info */
 	struct device			*dev;
 	struct fsl_lbc_regs __iomem	*regs;
-	int				irq;
+	int				irq[2];
 	wait_queue_head_t		irq_wait;
 	spinlock_t			lock;
 	void				*nand;
diff --git a/arch/powerpc/sysdev/fsl_lbc.c b/arch/powerpc/sysdev/fsl_lbc.c
index 6bc5a546d49f..d631022ffb4b 100644
--- a/arch/powerpc/sysdev/fsl_lbc.c
+++ b/arch/powerpc/sysdev/fsl_lbc.c
@@ -214,10 +214,14 @@ static irqreturn_t fsl_lbc_ctrl_irq(int irqno, void *data)
 	struct fsl_lbc_ctrl *ctrl = data;
 	struct fsl_lbc_regs __iomem *lbc = ctrl->regs;
 	u32 status;
+	unsigned long flags;
 
+	spin_lock_irqsave(&fsl_lbc_lock, flags);
 	status = in_be32(&lbc->ltesr);
-	if (!status)
+	if (!status) {
+		spin_unlock_irqrestore(&fsl_lbc_lock, flags);
 		return IRQ_NONE;
+	}
 
 	out_be32(&lbc->ltesr, LTESR_CLEAR);
 	out_be32(&lbc->lteatr, 0);
@@ -260,6 +264,7 @@ static irqreturn_t fsl_lbc_ctrl_irq(int irqno, void *data)
 	if (status & ~LTESR_MASK)
 		dev_err(ctrl->dev, "Unknown error: "
 			"LTESR 0x%08X\n", status);
+	spin_unlock_irqrestore(&fsl_lbc_lock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -298,8 +303,8 @@ static int fsl_lbc_ctrl_probe(struct platform_device *dev)
 		goto err;
 	}
 
-	fsl_lbc_ctrl_dev->irq = irq_of_parse_and_map(dev->dev.of_node, 0);
-	if (fsl_lbc_ctrl_dev->irq == NO_IRQ) {
+	fsl_lbc_ctrl_dev->irq[0] = irq_of_parse_and_map(dev->dev.of_node, 0);
+	if (!fsl_lbc_ctrl_dev->irq[0]) {
 		dev_err(&dev->dev, "failed to get irq resource\n");
 		ret = -ENODEV;
 		goto err;
@@ -311,20 +316,34 @@ static int fsl_lbc_ctrl_probe(struct platform_device *dev)
 	if (ret < 0)
 		goto err;
 
-	ret = request_irq(fsl_lbc_ctrl_dev->irq, fsl_lbc_ctrl_irq, 0,
+	ret = request_irq(fsl_lbc_ctrl_dev->irq[0], fsl_lbc_ctrl_irq, 0,
 				"fsl-lbc", fsl_lbc_ctrl_dev);
 	if (ret != 0) {
 		dev_err(&dev->dev, "failed to install irq (%d)\n",
-			fsl_lbc_ctrl_dev->irq);
-		ret = fsl_lbc_ctrl_dev->irq;
+			fsl_lbc_ctrl_dev->irq[0]);
+		ret = fsl_lbc_ctrl_dev->irq[0];
 		goto err;
 	}
 
+	fsl_lbc_ctrl_dev->irq[1] = irq_of_parse_and_map(dev->dev.of_node, 1);
+	if (fsl_lbc_ctrl_dev->irq[1]) {
+		ret = request_irq(fsl_lbc_ctrl_dev->irq[1], fsl_lbc_ctrl_irq,
+				IRQF_SHARED, "fsl-lbc-err", fsl_lbc_ctrl_dev);
+		if (ret) {
+			dev_err(&dev->dev, "failed to install irq (%d)\n",
+					fsl_lbc_ctrl_dev->irq[1]);
+			ret = fsl_lbc_ctrl_dev->irq[1];
+			goto err1;
+		}
+	}
+
 	/* Enable interrupts for any detected events */
 	out_be32(&fsl_lbc_ctrl_dev->regs->lteir, LTEIR_ENABLE);
 
 	return 0;
 
+err1:
+	free_irq(fsl_lbc_ctrl_dev->irq[0], fsl_lbc_ctrl_dev);
 err:
 	iounmap(fsl_lbc_ctrl_dev->regs);
 	kfree(fsl_lbc_ctrl_dev);
-- 
cgit v1.2.3


From 7d71d5b2e80623242b0c0823ce6cb635d089c8b2 Mon Sep 17 00:00:00 2001
From: Gerhard Sittig <gsi@denx.de>
Date: Sat, 30 Nov 2013 23:51:27 +0100
Subject: clk: mpc5xxx: switch to COMMON_CLK, retire PPC_CLOCK

the setup before the change was
- arch/powerpc/Kconfig had the PPC_CLOCK option, off by default
- depending on the PPC_CLOCK option the arch/powerpc/kernel/clock.c file
  was built, which implements the clk.h API but always returns -ENOSYS
  unless a platform registers specific callbacks
- the MPC52xx platform selected PPC_CLOCK but did not register any
  callbacks, thus all clk.h API calls keep resulting in -ENOSYS errors
  (which is OK, all peripheral drivers deal with the situation)
- the MPC512x platform selected PPC_CLOCK and registered specific
  callbacks implemented in arch/powerpc/platforms/512x/clock.c, thus
  provided real support for the clock API
- no other powerpc platform did select PPC_CLOCK

the situation after the change is
- the MPC512x platform implements the COMMON_CLK interface, and thus the
  PPC_CLOCK approach in arch/powerpc/platforms/512x/clock.c has become
  obsolete
- the MPC52xx platform still lacks genuine support for the clk.h API
  while this is not a change against the previous situation (the error
  code returned from COMMON_CLK stubs differs but every call still
  results in an error)
- with all references gone, the arch/powerpc/kernel/clock.c wrapper and
  the PPC_CLOCK option have become obsolete, as did the clk_interface.h
  header file

the switch from PPC_CLOCK to COMMON_CLK is done for all platforms within
the same commit such that multiplatform kernels (the combination of 512x
and 52xx within one executable) keep working

Cc: Mike Turquette <mturquette@linaro.org>
Cc: Anatolij Gustschin <agust@denx.de>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Gerhard Sittig <gsi@denx.de>
Signed-off-by: Anatolij Gustschin <agust@denx.de>
---
 arch/powerpc/Kconfig                     |   5 -
 arch/powerpc/include/asm/clk_interface.h |  20 -
 arch/powerpc/kernel/Makefile             |   1 -
 arch/powerpc/kernel/clock.c              |  82 ----
 arch/powerpc/platforms/512x/Kconfig      |   2 +-
 arch/powerpc/platforms/512x/Makefile     |   1 -
 arch/powerpc/platforms/512x/clock.c      | 754 -------------------------------
 arch/powerpc/platforms/52xx/Kconfig      |   2 +-
 8 files changed, 2 insertions(+), 865 deletions(-)
 delete mode 100644 arch/powerpc/include/asm/clk_interface.h
 delete mode 100644 arch/powerpc/kernel/clock.c
 delete mode 100644 arch/powerpc/platforms/512x/clock.c

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f0a893142cee..b131b26ca45a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1040,11 +1040,6 @@ config KEYS_COMPAT
 
 source "crypto/Kconfig"
 
-config PPC_CLOCK
-	bool
-	default n
-	select HAVE_CLK
-
 config PPC_LIB_RHEAP
 	bool
 
diff --git a/arch/powerpc/include/asm/clk_interface.h b/arch/powerpc/include/asm/clk_interface.h
deleted file mode 100644
index ab1882c1e176..000000000000
--- a/arch/powerpc/include/asm/clk_interface.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef __ASM_POWERPC_CLK_INTERFACE_H
-#define __ASM_POWERPC_CLK_INTERFACE_H
-
-#include <linux/clk.h>
-
-struct clk_interface {
-	struct clk*	(*clk_get)	(struct device *dev, const char *id);
-	int		(*clk_enable)	(struct clk *clk);
-	void		(*clk_disable)	(struct clk *clk);
-	unsigned long	(*clk_get_rate)	(struct clk *clk);
-	void		(*clk_put)	(struct clk *clk);
-	long		(*clk_round_rate) (struct clk *clk, unsigned long rate);
-	int 		(*clk_set_rate)	(struct clk *clk, unsigned long rate);
-	int		(*clk_set_parent) (struct clk *clk, struct clk *parent);
-	struct clk*	(*clk_get_parent) (struct clk *clk);
-};
-
-extern struct clk_interface clk_functions;
-
-#endif /* __ASM_POWERPC_CLK_INTERFACE_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 904d713366ff..fcc9a89a4695 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -48,7 +48,6 @@ obj-$(CONFIG_ALTIVEC)		+= vecemu.o
 obj-$(CONFIG_PPC_970_NAP)	+= idle_power4.o
 obj-$(CONFIG_PPC_P7_NAP)	+= idle_power7.o
 obj-$(CONFIG_PPC_OF)		+= of_platform.o prom_parse.o
-obj-$(CONFIG_PPC_CLOCK)		+= clock.o
 procfs-y			:= proc_powerpc.o
 obj-$(CONFIG_PROC_FS)		+= $(procfs-y)
 rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI)	:= rtas_pci.o
diff --git a/arch/powerpc/kernel/clock.c b/arch/powerpc/kernel/clock.c
deleted file mode 100644
index a764b47791e8..000000000000
--- a/arch/powerpc/kernel/clock.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Dummy clk implementations for powerpc.
- * These need to be overridden in platform code.
- */
-
-#include <linux/clk.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <asm/clk_interface.h>
-
-struct clk_interface clk_functions;
-
-struct clk *clk_get(struct device *dev, const char *id)
-{
-	if (clk_functions.clk_get)
-		return clk_functions.clk_get(dev, id);
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(clk_get);
-
-void clk_put(struct clk *clk)
-{
-	if (clk_functions.clk_put)
-		clk_functions.clk_put(clk);
-}
-EXPORT_SYMBOL(clk_put);
-
-int clk_enable(struct clk *clk)
-{
-	if (clk_functions.clk_enable)
-		return clk_functions.clk_enable(clk);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_enable);
-
-void clk_disable(struct clk *clk)
-{
-	if (clk_functions.clk_disable)
-		clk_functions.clk_disable(clk);
-}
-EXPORT_SYMBOL(clk_disable);
-
-unsigned long clk_get_rate(struct clk *clk)
-{
-	if (clk_functions.clk_get_rate)
-		return clk_functions.clk_get_rate(clk);
-	return 0;
-}
-EXPORT_SYMBOL(clk_get_rate);
-
-long clk_round_rate(struct clk *clk, unsigned long rate)
-{
-	if (clk_functions.clk_round_rate)
-		return clk_functions.clk_round_rate(clk, rate);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_round_rate);
-
-int clk_set_rate(struct clk *clk, unsigned long rate)
-{
-	if (clk_functions.clk_set_rate)
-		return clk_functions.clk_set_rate(clk, rate);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_set_rate);
-
-struct clk *clk_get_parent(struct clk *clk)
-{
-	if (clk_functions.clk_get_parent)
-		return clk_functions.clk_get_parent(clk);
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL(clk_get_parent);
-
-int clk_set_parent(struct clk *clk, struct clk *parent)
-{
-	if (clk_functions.clk_set_parent)
-		return clk_functions.clk_set_parent(clk, parent);
-	return -ENOSYS;
-}
-EXPORT_SYMBOL(clk_set_parent);
diff --git a/arch/powerpc/platforms/512x/Kconfig b/arch/powerpc/platforms/512x/Kconfig
index fc9c1cbfcb1d..5aa3f4b5332c 100644
--- a/arch/powerpc/platforms/512x/Kconfig
+++ b/arch/powerpc/platforms/512x/Kconfig
@@ -1,9 +1,9 @@
 config PPC_MPC512x
 	bool "512x-based boards"
 	depends on 6xx
+	select COMMON_CLK
 	select FSL_SOC
 	select IPIC
-	select PPC_CLOCK
 	select PPC_PCI_CHOICE
 	select FSL_PCI if PCI
 	select ARCH_WANT_OPTIONAL_GPIOLIB
diff --git a/arch/powerpc/platforms/512x/Makefile b/arch/powerpc/platforms/512x/Makefile
index 1e05f9def8a4..01693121a2b1 100644
--- a/arch/powerpc/platforms/512x/Makefile
+++ b/arch/powerpc/platforms/512x/Makefile
@@ -1,7 +1,6 @@
 #
 # Makefile for the Freescale PowerPC 512x linux kernel.
 #
-obj-$(CONFIG_PPC_CLOCK)		+= clock.o
 obj-$(CONFIG_COMMON_CLK)	+= clock-commonclk.o
 obj-y				+= mpc512x_shared.o
 obj-$(CONFIG_MPC5121_ADS)	+= mpc5121_ads.o mpc5121_ads_cpld.o
diff --git a/arch/powerpc/platforms/512x/clock.c b/arch/powerpc/platforms/512x/clock.c
deleted file mode 100644
index fd8a37653417..000000000000
--- a/arch/powerpc/platforms/512x/clock.c
+++ /dev/null
@@ -1,754 +0,0 @@
-/*
- * Copyright (C) 2007,2008 Freescale Semiconductor, Inc. All rights reserved.
- *
- * Author: John Rigby <jrigby@freescale.com>
- *
- * Implements the clk api defined in include/linux/clk.h
- *
- *    Original based on linux/arch/arm/mach-integrator/clock.c
- *
- *    Copyright (C) 2004 ARM Limited.
- *    Written by Deep Blue Solutions Limited.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/clk.h>
-#include <linux/mutex.h>
-#include <linux/io.h>
-
-#include <linux/of_address.h>
-#include <linux/of_platform.h>
-#include <asm/mpc5xxx.h>
-#include <asm/mpc5121.h>
-#include <asm/clk_interface.h>
-
-#include "mpc512x.h"
-
-#undef CLK_DEBUG
-
-static int clocks_initialized;
-
-#define CLK_HAS_RATE	0x1	/* has rate in MHz */
-#define CLK_HAS_CTRL	0x2	/* has control reg and bit */
-
-struct clk {
-	struct list_head node;
-	char name[32];
-	int flags;
-	struct device *dev;
-	unsigned long rate;
-	struct module *owner;
-	void (*calc) (struct clk *);
-	struct clk *parent;
-	int reg, bit;		/* CLK_HAS_CTRL */
-	int div_shift;		/* only used by generic_div_clk_calc */
-};
-
-static LIST_HEAD(clocks);
-static DEFINE_MUTEX(clocks_mutex);
-
-static struct clk *mpc5121_clk_get(struct device *dev, const char *id)
-{
-	struct clk *p, *clk = ERR_PTR(-ENOENT);
-	int dev_match;
-	int id_match;
-
-	if (dev == NULL || id == NULL)
-		return clk;
-
-	mutex_lock(&clocks_mutex);
-	list_for_each_entry(p, &clocks, node) {
-		dev_match = id_match = 0;
-
-		if (dev == p->dev)
-			dev_match++;
-		if (strcmp(id, p->name) == 0)
-			id_match++;
-		if ((dev_match || id_match) && try_module_get(p->owner)) {
-			clk = p;
-			break;
-		}
-	}
-	mutex_unlock(&clocks_mutex);
-
-	return clk;
-}
-
-#ifdef CLK_DEBUG
-static void dump_clocks(void)
-{
-	struct clk *p;
-
-	mutex_lock(&clocks_mutex);
-	printk(KERN_INFO "CLOCKS:\n");
-	list_for_each_entry(p, &clocks, node) {
-		pr_info("  %s=%ld", p->name, p->rate);
-		if (p->parent)
-			pr_cont(" %s=%ld", p->parent->name,
-			       p->parent->rate);
-		if (p->flags & CLK_HAS_CTRL)
-			pr_cont(" reg/bit=%d/%d", p->reg, p->bit);
-		pr_cont("\n");
-	}
-	mutex_unlock(&clocks_mutex);
-}
-#define	DEBUG_CLK_DUMP() dump_clocks()
-#else
-#define	DEBUG_CLK_DUMP()
-#endif
-
-
-static void mpc5121_clk_put(struct clk *clk)
-{
-	module_put(clk->owner);
-}
-
-#define NRPSC 12
-
-struct mpc512x_clockctl {
-	u32 spmr;		/* System PLL Mode Reg */
-	u32 sccr[2];		/* System Clk Ctrl Reg 1 & 2 */
-	u32 scfr1;		/* System Clk Freq Reg 1 */
-	u32 scfr2;		/* System Clk Freq Reg 2 */
-	u32 reserved;
-	u32 bcr;		/* Bread Crumb Reg */
-	u32 pccr[NRPSC];	/* PSC Clk Ctrl Reg 0-11 */
-	u32 spccr;		/* SPDIF Clk Ctrl Reg */
-	u32 cccr;		/* CFM Clk Ctrl Reg */
-	u32 dccr;		/* DIU Clk Cnfg Reg */
-};
-
-static struct mpc512x_clockctl __iomem *clockctl;
-
-static int mpc5121_clk_enable(struct clk *clk)
-{
-	unsigned int mask;
-
-	if (clk->flags & CLK_HAS_CTRL) {
-		mask = in_be32(&clockctl->sccr[clk->reg]);
-		mask |= 1 << clk->bit;
-		out_be32(&clockctl->sccr[clk->reg], mask);
-	}
-	return 0;
-}
-
-static void mpc5121_clk_disable(struct clk *clk)
-{
-	unsigned int mask;
-
-	if (clk->flags & CLK_HAS_CTRL) {
-		mask = in_be32(&clockctl->sccr[clk->reg]);
-		mask &= ~(1 << clk->bit);
-		out_be32(&clockctl->sccr[clk->reg], mask);
-	}
-}
-
-static unsigned long mpc5121_clk_get_rate(struct clk *clk)
-{
-	if (clk->flags & CLK_HAS_RATE)
-		return clk->rate;
-	else
-		return 0;
-}
-
-static long mpc5121_clk_round_rate(struct clk *clk, unsigned long rate)
-{
-	return rate;
-}
-
-static int mpc5121_clk_set_rate(struct clk *clk, unsigned long rate)
-{
-	return 0;
-}
-
-static int clk_register(struct clk *clk)
-{
-	mutex_lock(&clocks_mutex);
-	list_add(&clk->node, &clocks);
-	mutex_unlock(&clocks_mutex);
-	return 0;
-}
-
-static unsigned long spmf_mult(void)
-{
-	/*
-	 * Convert spmf to multiplier
-	 */
-	static int spmf_to_mult[] = {
-		68, 1, 12, 16,
-		20, 24, 28, 32,
-		36, 40, 44, 48,
-		52, 56, 60, 64
-	};
-	int spmf = (in_be32(&clockctl->spmr) >> 24) & 0xf;
-	return spmf_to_mult[spmf];
-}
-
-static unsigned long sysdiv_div_x_2(void)
-{
-	/*
-	 * Convert sysdiv to divisor x 2
-	 * Some divisors have fractional parts so
-	 * multiply by 2 then divide by this value
-	 */
-	static int sysdiv_to_div_x_2[] = {
-		4, 5, 6, 7,
-		8, 9, 10, 14,
-		12, 16, 18, 22,
-		20, 24, 26, 30,
-		28, 32, 34, 38,
-		36, 40, 42, 46,
-		44, 48, 50, 54,
-		52, 56, 58, 62,
-		60, 64, 66,
-	};
-	int sysdiv = (in_be32(&clockctl->scfr2) >> 26) & 0x3f;
-	return sysdiv_to_div_x_2[sysdiv];
-}
-
-static unsigned long ref_to_sys(unsigned long rate)
-{
-	rate *= spmf_mult();
-	rate *= 2;
-	rate /= sysdiv_div_x_2();
-
-	return rate;
-}
-
-static unsigned long sys_to_ref(unsigned long rate)
-{
-	rate *= sysdiv_div_x_2();
-	rate /= 2;
-	rate /= spmf_mult();
-
-	return rate;
-}
-
-static long ips_to_ref(unsigned long rate)
-{
-	int ips_div = (in_be32(&clockctl->scfr1) >> 23) & 0x7;
-
-	rate *= ips_div;	/* csb_clk = ips_clk * ips_div */
-	rate *= 2;		/* sys_clk = csb_clk * 2 */
-	return sys_to_ref(rate);
-}
-
-static unsigned long devtree_getfreq(char *clockname)
-{
-	struct device_node *np;
-	const unsigned int *prop;
-	unsigned int val = 0;
-
-	np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-immr");
-	if (np) {
-		prop = of_get_property(np, clockname, NULL);
-		if (prop)
-			val = *prop;
-	    of_node_put(np);
-	}
-	return val;
-}
-
-static void ref_clk_calc(struct clk *clk)
-{
-	unsigned long rate;
-
-	rate = devtree_getfreq("bus-frequency");
-	if (rate == 0) {
-		printk(KERN_ERR "No bus-frequency in dev tree\n");
-		clk->rate = 0;
-		return;
-	}
-	clk->rate = ips_to_ref(rate);
-}
-
-static struct clk ref_clk = {
-	.name = "ref_clk",
-	.calc = ref_clk_calc,
-};
-
-
-static void sys_clk_calc(struct clk *clk)
-{
-	clk->rate = ref_to_sys(ref_clk.rate);
-}
-
-static struct clk sys_clk = {
-	.name = "sys_clk",
-	.calc = sys_clk_calc,
-};
-
-static void diu_clk_calc(struct clk *clk)
-{
-	int diudiv_x_2 = in_be32(&clockctl->scfr1) & 0xff;
-	unsigned long rate;
-
-	rate = sys_clk.rate;
-
-	rate *= 2;
-	rate /= diudiv_x_2;
-
-	clk->rate = rate;
-}
-
-static void viu_clk_calc(struct clk *clk)
-{
-	unsigned long rate;
-
-	rate = sys_clk.rate;
-	rate /= 2;
-	clk->rate = rate;
-}
-
-static void half_clk_calc(struct clk *clk)
-{
-	clk->rate = clk->parent->rate / 2;
-}
-
-static void generic_div_clk_calc(struct clk *clk)
-{
-	int div = (in_be32(&clockctl->scfr1) >> clk->div_shift) & 0x7;
-
-	clk->rate = clk->parent->rate / div;
-}
-
-static void unity_clk_calc(struct clk *clk)
-{
-	clk->rate = clk->parent->rate;
-}
-
-static struct clk csb_clk = {
-	.name = "csb_clk",
-	.calc = half_clk_calc,
-	.parent = &sys_clk,
-};
-
-static void e300_clk_calc(struct clk *clk)
-{
-	int spmf = (in_be32(&clockctl->spmr) >> 16) & 0xf;
-	int ratex2 = clk->parent->rate * spmf;
-
-	clk->rate = ratex2 / 2;
-}
-
-static struct clk e300_clk = {
-	.name = "e300_clk",
-	.calc = e300_clk_calc,
-	.parent = &csb_clk,
-};
-
-static struct clk ips_clk = {
-	.name = "ips_clk",
-	.calc = generic_div_clk_calc,
-	.parent = &csb_clk,
-	.div_shift = 23,
-};
-
-/*
- * Clocks controlled by SCCR1 (.reg = 0)
- */
-static struct clk lpc_clk = {
-	.name = "lpc_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 0,
-	.bit = 30,
-	.calc = generic_div_clk_calc,
-	.parent = &ips_clk,
-	.div_shift = 11,
-};
-
-static struct clk nfc_clk = {
-	.name = "nfc_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 0,
-	.bit = 29,
-	.calc = generic_div_clk_calc,
-	.parent = &ips_clk,
-	.div_shift = 8,
-};
-
-static struct clk pata_clk = {
-	.name = "pata_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 0,
-	.bit = 28,
-	.calc = unity_clk_calc,
-	.parent = &ips_clk,
-};
-
-/*
- * PSC clocks (bits 27 - 16)
- * are setup elsewhere
- */
-
-static struct clk sata_clk = {
-	.name = "sata_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 0,
-	.bit = 14,
-	.calc = unity_clk_calc,
-	.parent = &ips_clk,
-};
-
-static struct clk fec_clk = {
-	.name = "fec_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 0,
-	.bit = 13,
-	.calc = unity_clk_calc,
-	.parent = &ips_clk,
-};
-
-static struct clk pci_clk = {
-	.name = "pci_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 0,
-	.bit = 11,
-	.calc = generic_div_clk_calc,
-	.parent = &csb_clk,
-	.div_shift = 20,
-};
-
-/*
- * Clocks controlled by SCCR2 (.reg = 1)
- */
-static struct clk diu_clk = {
-	.name = "diu_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 31,
-	.calc = diu_clk_calc,
-};
-
-static struct clk viu_clk = {
-	.name = "viu_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 18,
-	.calc = viu_clk_calc,
-};
-
-static struct clk axe_clk = {
-	.name = "axe_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 30,
-	.calc = unity_clk_calc,
-	.parent = &csb_clk,
-};
-
-static struct clk usb1_clk = {
-	.name = "usb1_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 28,
-	.calc = unity_clk_calc,
-	.parent = &csb_clk,
-};
-
-static struct clk usb2_clk = {
-	.name = "usb2_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 27,
-	.calc = unity_clk_calc,
-	.parent = &csb_clk,
-};
-
-static struct clk i2c_clk = {
-	.name = "i2c_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 26,
-	.calc = unity_clk_calc,
-	.parent = &ips_clk,
-};
-
-static struct clk mscan_clk = {
-	.name = "mscan_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 25,
-	.calc = unity_clk_calc,
-	.parent = &ips_clk,
-};
-
-static struct clk sdhc_clk = {
-	.name = "sdhc_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 24,
-	.calc = unity_clk_calc,
-	.parent = &ips_clk,
-};
-
-static struct clk mbx_bus_clk = {
-	.name = "mbx_bus_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 22,
-	.calc = half_clk_calc,
-	.parent = &csb_clk,
-};
-
-static struct clk mbx_clk = {
-	.name = "mbx_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 21,
-	.calc = unity_clk_calc,
-	.parent = &csb_clk,
-};
-
-static struct clk mbx_3d_clk = {
-	.name = "mbx_3d_clk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 20,
-	.calc = generic_div_clk_calc,
-	.parent = &mbx_bus_clk,
-	.div_shift = 14,
-};
-
-static void psc_mclk_in_calc(struct clk *clk)
-{
-	clk->rate = devtree_getfreq("psc_mclk_in");
-	if (!clk->rate)
-		clk->rate = 25000000;
-}
-
-static struct clk psc_mclk_in = {
-	.name = "psc_mclk_in",
-	.calc = psc_mclk_in_calc,
-};
-
-static struct clk spdif_txclk = {
-	.name = "spdif_txclk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 23,
-};
-
-static struct clk spdif_rxclk = {
-	.name = "spdif_rxclk",
-	.flags = CLK_HAS_CTRL,
-	.reg = 1,
-	.bit = 23,
-};
-
-static void ac97_clk_calc(struct clk *clk)
-{
-	/* ac97 bit clock is always 24.567 MHz */
-	clk->rate = 24567000;
-}
-
-static struct clk ac97_clk = {
-	.name = "ac97_clk_in",
-	.calc = ac97_clk_calc,
-};
-
-static struct clk *rate_clks[] = {
-	&ref_clk,
-	&sys_clk,
-	&diu_clk,
-	&viu_clk,
-	&csb_clk,
-	&e300_clk,
-	&ips_clk,
-	&fec_clk,
-	&sata_clk,
-	&pata_clk,
-	&nfc_clk,
-	&lpc_clk,
-	&mbx_bus_clk,
-	&mbx_clk,
-	&mbx_3d_clk,
-	&axe_clk,
-	&usb1_clk,
-	&usb2_clk,
-	&i2c_clk,
-	&mscan_clk,
-	&sdhc_clk,
-	&pci_clk,
-	&psc_mclk_in,
-	&spdif_txclk,
-	&spdif_rxclk,
-	&ac97_clk,
-	NULL
-};
-
-static void rate_clk_init(struct clk *clk)
-{
-	if (clk->calc) {
-		clk->calc(clk);
-		clk->flags |= CLK_HAS_RATE;
-		clk_register(clk);
-	} else {
-		printk(KERN_WARNING
-		       "Could not initialize clk %s without a calc routine\n",
-		       clk->name);
-	}
-}
-
-static void rate_clks_init(void)
-{
-	struct clk **cpp, *clk;
-
-	cpp = rate_clks;
-	while ((clk = *cpp++))
-		rate_clk_init(clk);
-}
-
-/*
- * There are two clk enable registers with 32 enable bits each
- * psc clocks and device clocks are all stored in dev_clks
- */
-static struct clk dev_clks[2][32];
-
-/*
- * Given a psc number return the dev_clk
- * associated with it
- */
-static struct clk *psc_dev_clk(int pscnum)
-{
-	int reg, bit;
-	struct clk *clk;
-
-	reg = 0;
-	bit = 27 - pscnum;
-
-	clk = &dev_clks[reg][bit];
-	clk->reg = 0;
-	clk->bit = bit;
-	return clk;
-}
-
-/*
- * PSC clock rate calculation
- */
-static void psc_calc_rate(struct clk *clk, int pscnum, struct device_node *np)
-{
-	unsigned long mclk_src = sys_clk.rate;
-	unsigned long mclk_div;
-
-	/*
-	 * Can only change value of mclk divider
-	 * when the divider is disabled.
-	 *
-	 * Zero is not a valid divider so minimum
-	 * divider is 1
-	 *
-	 * disable/set divider/enable
-	 */
-	out_be32(&clockctl->pccr[pscnum], 0);
-	out_be32(&clockctl->pccr[pscnum], 0x00020000);
-	out_be32(&clockctl->pccr[pscnum], 0x00030000);
-
-	if (in_be32(&clockctl->pccr[pscnum]) & 0x80) {
-		clk->rate = spdif_rxclk.rate;
-		return;
-	}
-
-	switch ((in_be32(&clockctl->pccr[pscnum]) >> 14) & 0x3) {
-	case 0:
-		mclk_src = sys_clk.rate;
-		break;
-	case 1:
-		mclk_src = ref_clk.rate;
-		break;
-	case 2:
-		mclk_src = psc_mclk_in.rate;
-		break;
-	case 3:
-		mclk_src = spdif_txclk.rate;
-		break;
-	}
-
-	mclk_div = ((in_be32(&clockctl->pccr[pscnum]) >> 17) & 0x7fff) + 1;
-	clk->rate = mclk_src / mclk_div;
-}
-
-/*
- * Find all psc nodes in device tree and assign a clock
- * with name "psc%d_mclk" and dev pointing at the device
- * returned from of_find_device_by_node
- */
-static void psc_clks_init(void)
-{
-	struct device_node *np;
-	struct platform_device *ofdev;
-	u32 reg;
-	const char *psc_compat;
-
-	psc_compat = mpc512x_select_psc_compat();
-	if (!psc_compat)
-		return;
-
-	for_each_compatible_node(np, NULL, psc_compat) {
-		if (!of_property_read_u32(np, "reg", &reg)) {
-			int pscnum = (reg & 0xf00) >> 8;
-			struct clk *clk = psc_dev_clk(pscnum);
-
-			clk->flags = CLK_HAS_RATE | CLK_HAS_CTRL;
-			ofdev = of_find_device_by_node(np);
-			clk->dev = &ofdev->dev;
-			/*
-			 * AC97 is special rate clock does
-			 * not go through normal path
-			 */
-			if (of_device_is_compatible(np, "fsl,mpc5121-psc-ac97"))
-				clk->rate = ac97_clk.rate;
-			else
-				psc_calc_rate(clk, pscnum, np);
-			sprintf(clk->name, "psc%d_mclk", pscnum);
-			clk_register(clk);
-			clk_enable(clk);
-		}
-	}
-}
-
-static struct clk_interface mpc5121_clk_functions = {
-	.clk_get		= mpc5121_clk_get,
-	.clk_enable		= mpc5121_clk_enable,
-	.clk_disable		= mpc5121_clk_disable,
-	.clk_get_rate		= mpc5121_clk_get_rate,
-	.clk_put		= mpc5121_clk_put,
-	.clk_round_rate		= mpc5121_clk_round_rate,
-	.clk_set_rate		= mpc5121_clk_set_rate,
-	.clk_set_parent		= NULL,
-	.clk_get_parent		= NULL,
-};
-
-int __init mpc5121_clk_init(void)
-{
-	struct device_node *np;
-
-	np = of_find_compatible_node(NULL, NULL, "fsl,mpc5121-clock");
-	if (np) {
-		clockctl = of_iomap(np, 0);
-		of_node_put(np);
-	}
-
-	if (!clockctl) {
-		printk(KERN_ERR "Could not map clock control registers\n");
-		return 0;
-	}
-
-	rate_clks_init();
-	psc_clks_init();
-
-	/* leave clockctl mapped forever */
-	/*iounmap(clockctl); */
-	DEBUG_CLK_DUMP();
-	clocks_initialized++;
-	clk_functions = mpc5121_clk_functions;
-	return 0;
-}
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index af54174801f7..b625a2c6f4f2 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -1,7 +1,7 @@
 config PPC_MPC52xx
 	bool "52xx-based boards"
 	depends on 6xx
-	select PPC_CLOCK
+	select COMMON_CLK
 	select PPC_PCI_CHOICE
 
 config PPC_MPC5200_SIMPLE
-- 
cgit v1.2.3


From 319bbe0ef513e33ecf1a3f40099b5b369122afbd Mon Sep 17 00:00:00 2001
From: Gerhard Sittig <gsi@denx.de>
Date: Tue, 10 Dec 2013 14:11:36 +0100
Subject: powerpc/512x: clk: support MPC5121/5123/5125 SoC variants

improve the common clock support code for MPC512x

- expand the CCM register set declaration with MPC5125 related registers
  (which reside in the previously "reserved" area)
- tell the MPC5121, MPC5123, and MPC5125 SoC variants apart, and derive
  the availability of components and their clocks from the detected SoC
  (MBX, AXE, VIU, SPDIF, PATA, SATA, PCI, second FEC, second SDHC,
  number of PSC components, type of NAND flash controller,
  interpretation of the CPMF bitfield, PSC/CAN mux0 stage input clocks,
  output clocks on SoC pins)
- add backwards compatibility (allow operation against a device tree
  which lacks clock related specs) for MPC5125 FECs, too

telling SoC variants apart and adjusting the clock tree's generation
occurs at runtime, a common generic binary supports all of the chips

the MPC5125 approach to the NFC clock (one register with two counters
for the high and low periods of the clock) is not implemented, as there
are no users and there is no common implementation which supports this
kind of clock -- the new implementation would be unused and could not
get verified, so it shall wait until there is demand

Signed-off-by: Gerhard Sittig <gsi@denx.de>
Acked-by: Mike Turquette <mturquette@linaro.org>
Signed-off-by: Anatolij Gustschin <agust@denx.de>
---
 arch/powerpc/include/asm/mpc5121.h            |   7 +-
 arch/powerpc/platforms/512x/clock-commonclk.c | 339 ++++++++++++++++++++++----
 include/dt-bindings/clock/mpc512x-clock.h     |   9 +-
 3 files changed, 309 insertions(+), 46 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/mpc5121.h b/arch/powerpc/include/asm/mpc5121.h
index 887d3d6133e3..4a69cd1d5041 100644
--- a/arch/powerpc/include/asm/mpc5121.h
+++ b/arch/powerpc/include/asm/mpc5121.h
@@ -37,7 +37,12 @@ struct mpc512x_ccm {
 	u32	cccr;	/* CFM Clock Control Register */
 	u32	dccr;	/* DIU Clock Control Register */
 	u32	mscan_ccr[4];	/* MSCAN Clock Control Registers */
-	u8	res[0x98]; /* Reserved */
+	u32	out_ccr[4];	/* OUT CLK Configure Registers */
+	u32	rsv0[2];	/* Reserved */
+	u32	scfr3;		/* System Clock Frequency Register 3 */
+	u32	rsv1[3];	/* Reserved */
+	u32	spll_lock_cnt;	/* System PLL Lock Counter */
+	u8	res[0x6c];	/* Reserved */
 };
 
 /*
diff --git a/arch/powerpc/platforms/512x/clock-commonclk.c b/arch/powerpc/platforms/512x/clock-commonclk.c
index 16569376b259..6eb614a271fb 100644
--- a/arch/powerpc/platforms/512x/clock-commonclk.c
+++ b/arch/powerpc/platforms/512x/clock-commonclk.c
@@ -36,7 +36,8 @@ enum {
 #define NR_PSCS			12
 #define NR_MSCANS		4
 #define NR_SPDIFS		1
-#define NR_MCLKS		(NR_PSCS + NR_MSCANS + NR_SPDIFS)
+#define NR_OUTCLK		4
+#define NR_MCLKS		(NR_PSCS + NR_MSCANS + NR_SPDIFS + NR_OUTCLK)
 
 /* extend the public set of clocks by adding internal slots for management */
 enum {
@@ -46,11 +47,11 @@ enum {
 	MPC512x_CLK_DDR,
 	MPC512x_CLK_MEM,
 	MPC512x_CLK_IIM,
-	MPC512x_CLK_SDHC_2,
 	/* intermediates in div+gate combos or fractional dividers */
 	MPC512x_CLK_DDR_UG,
 	MPC512x_CLK_SDHC_x4,
 	MPC512x_CLK_SDHC_UG,
+	MPC512x_CLK_SDHC2_UG,
 	MPC512x_CLK_DIU_x4,
 	MPC512x_CLK_DIU_UG,
 	MPC512x_CLK_MBX_BUS_UG,
@@ -76,6 +77,144 @@ static struct clk_onecell_data clk_data;
 static struct mpc512x_ccm __iomem *clkregs;
 static DEFINE_SPINLOCK(clklock);
 
+/* SoC variants {{{ */
+
+/*
+ * tell SoC variants apart as they are rather similar yet not identical,
+ * cache the result in an enum to not repeatedly run the expensive OF test
+ *
+ * MPC5123 is an MPC5121 without the MBX graphics accelerator
+ *
+ * MPC5125 has many more differences: no MBX, no AXE, no VIU, no SPDIF,
+ * no PATA, no SATA, no PCI, two FECs (of different compatibility name),
+ * only 10 PSCs (of different compatibility name), two SDHCs, different
+ * NFC IP block, output clocks, system PLL status query, different CPMF
+ * interpretation, no CFM, different fourth PSC/CAN mux0 input -- yet
+ * those differences can get folded into this clock provider support
+ * code and don't warrant a separate highly redundant implementation
+ */
+
+static enum soc_type {
+	MPC512x_SOC_MPC5121,
+	MPC512x_SOC_MPC5123,
+	MPC512x_SOC_MPC5125,
+} soc;
+
+static void mpc512x_clk_determine_soc(void)
+{
+	if (of_machine_is_compatible("fsl,mpc5121")) {
+		soc = MPC512x_SOC_MPC5121;
+		return;
+	}
+	if (of_machine_is_compatible("fsl,mpc5123")) {
+		soc = MPC512x_SOC_MPC5123;
+		return;
+	}
+	if (of_machine_is_compatible("fsl,mpc5125")) {
+		soc = MPC512x_SOC_MPC5125;
+		return;
+	}
+}
+
+static bool soc_has_mbx(void)
+{
+	if (soc == MPC512x_SOC_MPC5121)
+		return true;
+	return false;
+}
+
+static bool soc_has_axe(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return false;
+	return true;
+}
+
+static bool soc_has_viu(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return false;
+	return true;
+}
+
+static bool soc_has_spdif(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return false;
+	return true;
+}
+
+static bool soc_has_pata(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return false;
+	return true;
+}
+
+static bool soc_has_sata(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return false;
+	return true;
+}
+
+static bool soc_has_pci(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return false;
+	return true;
+}
+
+static bool soc_has_fec2(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return true;
+	return false;
+}
+
+static int soc_max_pscnum(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return 10;
+	return 12;
+}
+
+static bool soc_has_sdhc2(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return true;
+	return false;
+}
+
+static bool soc_has_nfc_5125(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return true;
+	return false;
+}
+
+static bool soc_has_outclk(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return true;
+	return false;
+}
+
+static bool soc_has_cpmf_0_bypass(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return true;
+	return false;
+}
+
+static bool soc_has_mclk_mux0_canin(void)
+{
+	if (soc == MPC512x_SOC_MPC5125)
+		return true;
+	return false;
+}
+
+/* }}} SoC variants */
 /* common clk API wrappers {{{ */
 
 /* convenience wrappers around the common clk API */
@@ -196,12 +335,23 @@ static int get_sys_div_x2(void)
  */
 static int get_cpmf_mult_x2(void)
 {
-	static int cpmf_to_mult[] = {
+	static int cpmf_to_mult_x36[] = {
+		/* 0b000 is "times 36" */
 		72, 2, 2, 3, 4, 5, 6, 7,
 	};
+	static int cpmf_to_mult_0by[] = {
+		/* 0b000 is "bypass" */
+		2, 2, 2, 3, 4, 5, 6, 7,
+	};
+
+	int *cpmf_to_mult;
 	int cpmf;
 
 	cpmf = get_bit_field(&clkregs->spmr, 16, 4);
+	if (soc_has_cpmf_0_bypass())
+		cpmf_to_mult = cpmf_to_mult_0by;
+	else
+		cpmf_to_mult = cpmf_to_mult_x36;
 	return cpmf_to_mult[cpmf];
 }
 
@@ -347,14 +497,19 @@ static void mpc512x_clk_setup_ref_clock(struct device_node *np, int bus_freq,
  * it's the very data type dictated by <linux/clk-provider.h>,
  * "fixing" this warning will break compilation
  */
-static const char *parent_names_mux0[] = {
+static const char *parent_names_mux0_spdif[] = {
 	"sys", "ref", "psc-mclk-in", "spdif-tx",
 };
 
+static const char *parent_names_mux0_canin[] = {
+	"sys", "ref", "psc-mclk-in", "can-clk-in",
+};
+
 enum mclk_type {
 	MCLK_TYPE_PSC,
 	MCLK_TYPE_MSCAN,
 	MCLK_TYPE_SPDIF,
+	MCLK_TYPE_OUTCLK,
 };
 
 struct mclk_setup_data {
@@ -394,6 +549,15 @@ struct mclk_setup_data {
 	"spdif_mclk", \
 }
 
+#define MCLK_SETUP_DATA_OUTCLK(id) { \
+	MCLK_TYPE_OUTCLK, 0, \
+	"out" #id "-mux0", \
+	"out" #id "-en0", \
+	"out" #id "_mclk_div", \
+	{ "out" #id "_mclk_div", "dummy", }, \
+	"out" #id "_clk", \
+}
+
 static struct mclk_setup_data mclk_psc_data[] = {
 	MCLK_SETUP_DATA_PSC(0),
 	MCLK_SETUP_DATA_PSC(1),
@@ -420,6 +584,13 @@ static struct mclk_setup_data mclk_spdif_data[] = {
 	MCLK_SETUP_DATA_SPDIF,
 };
 
+static struct mclk_setup_data mclk_outclk_data[] = {
+	MCLK_SETUP_DATA_OUTCLK(0),
+	MCLK_SETUP_DATA_OUTCLK(1),
+	MCLK_SETUP_DATA_OUTCLK(2),
+	MCLK_SETUP_DATA_OUTCLK(3),
+};
+
 /* setup the MCLK clock subtree of an individual PSC/MSCAN/SPDIF */
 static void mpc512x_clk_setup_mclk(struct mclk_setup_data *entry, size_t idx)
 {
@@ -447,6 +618,13 @@ static void mpc512x_clk_setup_mclk(struct mclk_setup_data *entry, size_t idx)
 			     + (NR_PSCS + NR_MSCANS) * MCLK_MAX_IDX;
 		mccr_reg = &clkregs->spccr;
 		break;
+	case MCLK_TYPE_OUTCLK:
+		clks_idx_pub = MPC512x_CLK_OUT0_CLK + idx;
+		clks_idx_int = MPC512x_CLK_MCLKS_FIRST
+			     + (NR_PSCS + NR_MSCANS + NR_SPDIFS + idx)
+			     * MCLK_MAX_IDX;
+		mccr_reg = &clkregs->out_ccr[idx];
+		break;
 	default:
 		return;
 	}
@@ -495,7 +673,10 @@ static void mpc512x_clk_setup_mclk(struct mclk_setup_data *entry, size_t idx)
 	 */
 	clks[clks_idx_int + MCLK_IDX_MUX0] = mpc512x_clk_muxed(
 			entry->name_mux0,
-			&parent_names_mux0[0], ARRAY_SIZE(parent_names_mux0),
+			soc_has_mclk_mux0_canin()
+				? &parent_names_mux0_canin[0]
+				: &parent_names_mux0_spdif[0],
+			ARRAY_SIZE(parent_names_mux0_spdif),
 			mccr_reg, 14, 2);
 	clks[clks_idx_int + MCLK_IDX_EN0] = mpc512x_clk_gated(
 			entry->name_en0, entry->name_mux0,
@@ -576,6 +757,12 @@ static void mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq)
 	clks[MPC512x_CLK_SDHC_UG] = mpc512x_clk_divider("sdhc-ug", "sdhc-x4", 0,
 							&clkregs->scfr2, 1, 7,
 							CLK_DIVIDER_ONE_BASED);
+	if (soc_has_sdhc2()) {
+		clks[MPC512x_CLK_SDHC2_UG] = mpc512x_clk_divider(
+				"sdhc2-ug", "sdhc-x4", 0, &clkregs->scfr2,
+				9, 7, CLK_DIVIDER_ONE_BASED);
+	}
+
 	clks[MPC512x_CLK_DIU_x4] = mpc512x_clk_factor("diu-x4", "csb", 4, 1);
 	clks[MPC512x_CLK_DIU_UG] = mpc512x_clk_divider("diu-ug", "diu-x4", 0,
 						       &clkregs->scfr1, 0, 8,
@@ -592,19 +779,32 @@ static void mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq)
 	div = 2;	/* compensate for the fractional factor */
 	clks[MPC512x_CLK_E300] = mpc512x_clk_factor("e300", "csb", mul, div);
 
-	clks[MPC512x_CLK_MBX_BUS_UG] = mpc512x_clk_factor("mbx-bus-ug", "csb",
-							  1, 2);
-	clks[MPC512x_CLK_MBX_UG] = mpc512x_clk_divtable("mbx-ug", "mbx-bus-ug",
-							&clkregs->scfr1, 14, 3,
-							divtab_1234);
-	clks[MPC512x_CLK_MBX_3D_UG] = mpc512x_clk_factor("mbx-3d-ug", "mbx-ug",
-							 1, 1);
-	clks[MPC512x_CLK_PCI_UG] = mpc512x_clk_divtable("pci-ug", "csb",
-							&clkregs->scfr1, 20, 3,
-							divtab_2346);
-	clks[MPC512x_CLK_NFC_UG] = mpc512x_clk_divtable("nfc-ug", "ips",
-							&clkregs->scfr1, 8, 3,
-							divtab_1234);
+	if (soc_has_mbx()) {
+		clks[MPC512x_CLK_MBX_BUS_UG] = mpc512x_clk_factor(
+				"mbx-bus-ug", "csb", 1, 2);
+		clks[MPC512x_CLK_MBX_UG] = mpc512x_clk_divtable(
+				"mbx-ug", "mbx-bus-ug", &clkregs->scfr1,
+				14, 3, divtab_1234);
+		clks[MPC512x_CLK_MBX_3D_UG] = mpc512x_clk_factor(
+				"mbx-3d-ug", "mbx-ug", 1, 1);
+	}
+	if (soc_has_pci()) {
+		clks[MPC512x_CLK_PCI_UG] = mpc512x_clk_divtable(
+				"pci-ug", "csb", &clkregs->scfr1,
+				20, 3, divtab_2346);
+	}
+	if (soc_has_nfc_5125()) {
+		/*
+		 * XXX TODO implement 5125 NFC clock setup logic,
+		 * with high/low period counters in clkregs->scfr3,
+		 * currently there are no users so it's ENOIMPL
+		 */
+		clks[MPC512x_CLK_NFC_UG] = ERR_PTR(-ENOTSUPP);
+	} else {
+		clks[MPC512x_CLK_NFC_UG] = mpc512x_clk_divtable(
+				"nfc-ug", "ips", &clkregs->scfr1,
+				8, 3, divtab_1234);
+	}
 	clks[MPC512x_CLK_LPC_UG] = mpc512x_clk_divtable("lpc-ug", "ips",
 							&clkregs->scfr1, 11, 3,
 							divtab_1234);
@@ -613,10 +813,12 @@ static void mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq)
 						  &clkregs->sccr1, 30);
 	clks[MPC512x_CLK_NFC] = mpc512x_clk_gated("nfc", "nfc-ug",
 						  &clkregs->sccr1, 29);
-	clks[MPC512x_CLK_PATA] = mpc512x_clk_gated("pata", "ips",
-						   &clkregs->sccr1, 28);
+	if (soc_has_pata()) {
+		clks[MPC512x_CLK_PATA] = mpc512x_clk_gated(
+				"pata", "ips", &clkregs->sccr1, 28);
+	}
 	/* for PSCs there is a "registers" gate and a bitrate MCLK subtree */
-	for (mclk_idx = 0; mclk_idx < ARRAY_SIZE(mclk_psc_data); mclk_idx++) {
+	for (mclk_idx = 0; mclk_idx < soc_max_pscnum(); mclk_idx++) {
 		char name[12];
 		snprintf(name, sizeof(name), "psc%d", mclk_idx);
 		clks[MPC512x_CLK_PSC0 + mclk_idx] = mpc512x_clk_gated(
@@ -625,19 +827,29 @@ static void mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq)
 	}
 	clks[MPC512x_CLK_PSC_FIFO] = mpc512x_clk_gated("psc-fifo", "ips",
 						       &clkregs->sccr1, 15);
-	clks[MPC512x_CLK_SATA] = mpc512x_clk_gated("sata", "ips",
-						   &clkregs->sccr1, 14);
+	if (soc_has_sata()) {
+		clks[MPC512x_CLK_SATA] = mpc512x_clk_gated(
+				"sata", "ips", &clkregs->sccr1, 14);
+	}
 	clks[MPC512x_CLK_FEC] = mpc512x_clk_gated("fec", "ips",
 						  &clkregs->sccr1, 13);
-	clks[MPC512x_CLK_PCI] = mpc512x_clk_gated("pci", "pci-ug",
-						  &clkregs->sccr1, 11);
+	if (soc_has_pci()) {
+		clks[MPC512x_CLK_PCI] = mpc512x_clk_gated(
+				"pci", "pci-ug", &clkregs->sccr1, 11);
+	}
 	clks[MPC512x_CLK_DDR] = mpc512x_clk_gated("ddr", "ddr-ug",
 						  &clkregs->sccr1, 10);
+	if (soc_has_fec2()) {
+		clks[MPC512x_CLK_FEC2] = mpc512x_clk_gated(
+				"fec2", "ips", &clkregs->sccr1, 9);
+	}
 
 	clks[MPC512x_CLK_DIU] = mpc512x_clk_gated("diu", "diu-ug",
 						  &clkregs->sccr2, 31);
-	clks[MPC512x_CLK_AXE] = mpc512x_clk_gated("axe", "csb",
-						  &clkregs->sccr2, 30);
+	if (soc_has_axe()) {
+		clks[MPC512x_CLK_AXE] = mpc512x_clk_gated(
+				"axe", "csb", &clkregs->sccr2, 30);
+	}
 	clks[MPC512x_CLK_MEM] = mpc512x_clk_gated("mem", "ips",
 						  &clkregs->sccr2, 29);
 	clks[MPC512x_CLK_USB1] = mpc512x_clk_gated("usb1", "csb",
@@ -654,21 +866,35 @@ static void mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq)
 	clks[MPC512x_CLK_SDHC] = mpc512x_clk_gated("sdhc", "sdhc-ug",
 						   &clkregs->sccr2, 24);
 	/* there is only one SPDIF component, which shares MCLK support code */
-	clks[MPC512x_CLK_SPDIF] = mpc512x_clk_gated("spdif", "ips",
-						    &clkregs->sccr2, 23);
-	mpc512x_clk_setup_mclk(&mclk_spdif_data[0], 0);
-	clks[MPC512x_CLK_MBX_BUS] = mpc512x_clk_gated("mbx-bus", "mbx-bus-ug",
-						      &clkregs->sccr2, 22);
-	clks[MPC512x_CLK_MBX] = mpc512x_clk_gated("mbx", "mbx-ug",
-						  &clkregs->sccr2, 21);
-	clks[MPC512x_CLK_MBX_3D] = mpc512x_clk_gated("mbx-3d", "mbx-3d-ug",
-						     &clkregs->sccr2, 20);
+	if (soc_has_spdif()) {
+		clks[MPC512x_CLK_SPDIF] = mpc512x_clk_gated(
+				"spdif", "ips", &clkregs->sccr2, 23);
+		mpc512x_clk_setup_mclk(&mclk_spdif_data[0], 0);
+	}
+	if (soc_has_mbx()) {
+		clks[MPC512x_CLK_MBX_BUS] = mpc512x_clk_gated(
+				"mbx-bus", "mbx-bus-ug", &clkregs->sccr2, 22);
+		clks[MPC512x_CLK_MBX] = mpc512x_clk_gated(
+				"mbx", "mbx-ug", &clkregs->sccr2, 21);
+		clks[MPC512x_CLK_MBX_3D] = mpc512x_clk_gated(
+				"mbx-3d", "mbx-3d-ug", &clkregs->sccr2, 20);
+	}
 	clks[MPC512x_CLK_IIM] = mpc512x_clk_gated("iim", "csb",
 						  &clkregs->sccr2, 19);
-	clks[MPC512x_CLK_VIU] = mpc512x_clk_gated("viu", "csb",
-						  &clkregs->sccr2, 18);
-	clks[MPC512x_CLK_SDHC_2] = mpc512x_clk_gated("sdhc-2", "sdhc-ug",
-						     &clkregs->sccr2, 17);
+	if (soc_has_viu()) {
+		clks[MPC512x_CLK_VIU] = mpc512x_clk_gated(
+				"viu", "csb", &clkregs->sccr2, 18);
+	}
+	if (soc_has_sdhc2()) {
+		clks[MPC512x_CLK_SDHC2] = mpc512x_clk_gated(
+				"sdhc-2", "sdhc2-ug", &clkregs->sccr2, 17);
+	}
+
+	if (soc_has_outclk()) {
+		size_t idx;	/* used as mclk_idx, just to trim line length */
+		for (idx = 0; idx < ARRAY_SIZE(mclk_outclk_data); idx++)
+			mpc512x_clk_setup_mclk(&mclk_outclk_data[idx], idx);
+	}
 
 	/*
 	 * externally provided clocks (when implemented in hardware,
@@ -678,10 +904,18 @@ static void mpc512x_clk_setup_clock_tree(struct device_node *np, int busfreq)
 	if (!freq)
 		freq = 25000000;
 	clks[MPC512x_CLK_PSC_MCLK_IN] = mpc512x_clk_fixed("psc_mclk_in", freq);
-	freq = get_freq_from_dt("spdif_tx_in");
-	clks[MPC512x_CLK_SPDIF_TX_IN] = mpc512x_clk_fixed("spdif_tx_in", freq);
-	freq = get_freq_from_dt("spdif_rx_in");
-	clks[MPC512x_CLK_SPDIF_TX_IN] = mpc512x_clk_fixed("spdif_rx_in", freq);
+	if (soc_has_mclk_mux0_canin()) {
+		freq = get_freq_from_dt("can_clk_in");
+		clks[MPC512x_CLK_CAN_CLK_IN] = mpc512x_clk_fixed(
+				"can_clk_in", freq);
+	} else {
+		freq = get_freq_from_dt("spdif_tx_in");
+		clks[MPC512x_CLK_SPDIF_TX_IN] = mpc512x_clk_fixed(
+				"spdif_tx_in", freq);
+		freq = get_freq_from_dt("spdif_rx_in");
+		clks[MPC512x_CLK_SPDIF_TX_IN] = mpc512x_clk_fixed(
+				"spdif_rx_in", freq);
+	}
 
 	/* fixed frequency for AC97, always 24.567MHz */
 	clks[MPC512x_CLK_AC97] = mpc512x_clk_fixed("ac97", 24567000);
@@ -883,6 +1117,20 @@ static void mpc5121_clk_provide_backwards_compat(void)
 		NODE_PREP;
 		NODE_CHK("per", clks[MPC512x_CLK_FEC], 0, FEC);
 	}
+	/*
+	 * MPC5125 has two FECs: FEC1 at 0x2800, FEC2 at 0x4800;
+	 * the clock items don't "form an array" since FEC2 was
+	 * added only later and was not allowed to shift all other
+	 * clock item indices, so the numbers aren't adjacent
+	 */
+	FOR_NODES("fsl,mpc5125-fec") {
+		NODE_PREP;
+		if (res.start & 0x4000)
+			idx = MPC512x_CLK_FEC2;
+		else
+			idx = MPC512x_CLK_FEC;
+		NODE_CHK("per", clks[idx], 0, FEC);
+	}
 
 	FOR_NODES("fsl,mpc5121-usb2-dr") {
 		NODE_PREP;
@@ -932,6 +1180,9 @@ int __init mpc5121_clk_init(void)
 	clkregs = of_iomap(clk_np, 0);
 	WARN_ON(!clkregs);
 
+	/* determine the SoC variant we run on */
+	mpc512x_clk_determine_soc();
+
 	/* invalidate all not yet registered clock slots */
 	mpc512x_clk_preset_data();
 
diff --git a/include/dt-bindings/clock/mpc512x-clock.h b/include/dt-bindings/clock/mpc512x-clock.h
index 9e81b3b99a32..4f94919327ce 100644
--- a/include/dt-bindings/clock/mpc512x-clock.h
+++ b/include/dt-bindings/clock/mpc512x-clock.h
@@ -63,7 +63,14 @@
 #define MPC512x_CLK_PSC9		55
 #define MPC512x_CLK_PSC10		56
 #define MPC512x_CLK_PSC11		57
+#define MPC512x_CLK_SDHC2		58
+#define MPC512x_CLK_FEC2		59
+#define MPC512x_CLK_OUT0_CLK		60
+#define MPC512x_CLK_OUT1_CLK		61
+#define MPC512x_CLK_OUT2_CLK		62
+#define MPC512x_CLK_OUT3_CLK		63
+#define MPC512x_CLK_CAN_CLK_IN		64
 
-#define MPC512x_CLK_LAST_PUBLIC		57
+#define MPC512x_CLK_LAST_PUBLIC		64
 
 #endif
-- 
cgit v1.2.3


From c141611fb1ee2cfc374cf9be5327e97f361c4bed Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Thu, 9 Jan 2014 00:44:29 -0500
Subject: powerpc: Delete non-required instances of include <linux/init.h>

None of these files are actually using any __init type directives
and hence don't need to include <linux/init.h>.  Most are just a
left over from __devinit and __cpuinit removal, or simply due to
code getting copied from one driver to the next.

The one instance where we add an include for init.h covers off
a case where that file was implicitly getting it from another
header which itself didn't need it.

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/paca.h                | 1 -
 arch/powerpc/include/asm/ppc_asm.h             | 1 -
 arch/powerpc/include/asm/ps3.h                 | 1 -
 arch/powerpc/include/asm/vio.h                 | 1 -
 arch/powerpc/kernel/cacheinfo.c                | 1 -
 arch/powerpc/kernel/crash.c                    | 1 -
 arch/powerpc/kernel/eeh_pe.c                   | 1 -
 arch/powerpc/kernel/head_64.S                  | 1 +
 arch/powerpc/kernel/hw_breakpoint.c            | 1 -
 arch/powerpc/kernel/iomap.c                    | 1 -
 arch/powerpc/kernel/kgdb.c                     | 1 -
 arch/powerpc/kernel/process.c                  | 1 -
 arch/powerpc/kernel/smp-tbsync.c               | 1 -
 arch/powerpc/kernel/syscalls.c                 | 1 -
 arch/powerpc/kernel/vdso32/vdso32_wrapper.S    | 1 -
 arch/powerpc/kernel/vdso64/vdso64_wrapper.S    | 1 -
 arch/powerpc/mm/pgtable.c                      | 1 -
 arch/powerpc/mm/pgtable_64.c                   | 1 -
 arch/powerpc/mm/tlb_hash64.c                   | 1 -
 arch/powerpc/oprofile/op_model_7450.c          | 1 -
 arch/powerpc/oprofile/op_model_cell.c          | 1 -
 arch/powerpc/oprofile/op_model_fsl_emb.c       | 1 -
 arch/powerpc/oprofile/op_model_pa6t.c          | 1 -
 arch/powerpc/oprofile/op_model_power4.c        | 1 -
 arch/powerpc/oprofile/op_model_rs64.c          | 1 -
 arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c | 1 -
 arch/powerpc/platforms/83xx/suspend.c          | 1 -
 arch/powerpc/platforms/85xx/sgy_cts1000.c      | 1 -
 arch/powerpc/platforms/chrp/smp.c              | 1 -
 arch/powerpc/platforms/embedded6xx/hlwd-pic.c  | 1 -
 arch/powerpc/platforms/pasemi/dma_lib.c        | 1 -
 arch/powerpc/platforms/powermac/pfunc_core.c   | 1 -
 arch/powerpc/platforms/powernv/eeh-ioda.c      | 1 -
 arch/powerpc/platforms/pseries/cmm.c           | 1 -
 arch/powerpc/platforms/pseries/dtl.c           | 1 -
 arch/powerpc/sysdev/cpm2_pic.c                 | 1 -
 arch/powerpc/sysdev/fsl_ifc.c                  | 1 -
 arch/powerpc/sysdev/ge/ge_pic.h                | 1 -
 arch/powerpc/sysdev/i8259.c                    | 1 -
 arch/powerpc/sysdev/mpc8xx_pic.c               | 1 -
 arch/powerpc/sysdev/qe_lib/qe_io.c             | 1 -
 arch/powerpc/sysdev/qe_lib/ucc.c               | 1 -
 arch/powerpc/sysdev/qe_lib/ucc_fast.c          | 1 -
 arch/powerpc/sysdev/qe_lib/ucc_slow.c          | 1 -
 arch/powerpc/sysdev/udbg_memcons.c             | 1 -
 arch/powerpc/sysdev/xics/icp-hv.c              | 1 -
 46 files changed, 1 insertion(+), 45 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index c3523d1dda58..68bee4636045 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -16,7 +16,6 @@
 
 #ifdef CONFIG_PPC64
 
-#include <linux/init.h>
 #include <asm/types.h>
 #include <asm/lppaca.h>
 #include <asm/mmu.h>
diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index f595b98079ee..7689e0e98d33 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -4,7 +4,6 @@
 #ifndef _ASM_POWERPC_PPC_ASM_H
 #define _ASM_POWERPC_PPC_ASM_H
 
-#include <linux/init.h>
 #include <linux/stringify.h>
 #include <asm/asm-compat.h>
 #include <asm/processor.h>
diff --git a/arch/powerpc/include/asm/ps3.h b/arch/powerpc/include/asm/ps3.h
index 678a7c1d9cb8..a1bc7e758422 100644
--- a/arch/powerpc/include/asm/ps3.h
+++ b/arch/powerpc/include/asm/ps3.h
@@ -21,7 +21,6 @@
 #if !defined(_ASM_POWERPC_PS3_H)
 #define _ASM_POWERPC_PS3_H
 
-#include <linux/init.h>
 #include <linux/types.h>
 #include <linux/device.h>
 #include <asm/cell-pmu.h>
diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h
index 68d0cc998b1b..4f9b7ca0710f 100644
--- a/arch/powerpc/include/asm/vio.h
+++ b/arch/powerpc/include/asm/vio.h
@@ -15,7 +15,6 @@
 #define _ASM_POWERPC_VIO_H
 #ifdef __KERNEL__
 
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index 654932727873..abfa011344d9 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -12,7 +12,6 @@
 
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c
index fdcd8f551aff..18d7c80ddeb9 100644
--- a/arch/powerpc/kernel/crash.c
+++ b/arch/powerpc/kernel/crash.c
@@ -17,7 +17,6 @@
 #include <linux/export.h>
 #include <linux/crash_dump.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/types.h>
 
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index f9450537e335..1feccd64f955 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -25,7 +25,6 @@
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/gfp.h>
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/string.h>
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 4f0946de2d5c..b7363bd42452 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -23,6 +23,7 @@
  */
 
 #include <linux/threads.h>
+#include <linux/init.h>
 #include <asm/reg.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index f0b47d1a6b0e..b0a1792279bb 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -28,7 +28,6 @@
 #include <linux/percpu.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/hw_breakpoint.h>
diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c
index 97a3715ac8bd..b82227e7e21b 100644
--- a/arch/powerpc/kernel/iomap.c
+++ b/arch/powerpc/kernel/iomap.c
@@ -3,7 +3,6 @@
  *
  * (C) Copyright 2004 Linus Torvalds
  */
-#include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/mm.h>
 #include <linux/export.h>
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 83e89d310734..8504657379f1 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -15,7 +15,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/kgdb.h>
 #include <linux/smp.h>
 #include <linux/signal.h>
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 3386d8ab7eb0..bf8e136316e2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -25,7 +25,6 @@
 #include <linux/slab.h>
 #include <linux/user.h>
 #include <linux/elf.h>
-#include <linux/init.h>
 #include <linux/prctl.h>
 #include <linux/init_task.h>
 #include <linux/export.h>
diff --git a/arch/powerpc/kernel/smp-tbsync.c b/arch/powerpc/kernel/smp-tbsync.c
index e68fd1ae727a..7a37ecd3afa3 100644
--- a/arch/powerpc/kernel/smp-tbsync.c
+++ b/arch/powerpc/kernel/smp-tbsync.c
@@ -9,7 +9,6 @@
 #include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/unistd.h>
-#include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/atomic.h>
 #include <asm/smp.h>
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index 4e3cc47f26b9..cd9be9aa016d 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -34,7 +34,6 @@
 #include <linux/ipc.h>
 #include <linux/utsname.h>
 #include <linux/file.h>
-#include <linux/init.h>
 #include <linux/personality.h>
 
 #include <asm/uaccess.h>
diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
index 6e8f507ed32b..79683d0393f5 100644
--- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
+++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/linkage.h>
 #include <asm/page.h>
 
diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
index b8553d62b792..8df9e2463007 100644
--- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
+++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/linkage.h>
 #include <asm/page.h>
 
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index ad90429bbd8b..c695943a513c 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -24,7 +24,6 @@
 #include <linux/kernel.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
-#include <linux/init.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <linux/hugetlb.h>
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 02e8681fb865..7f0e872ab83d 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -33,7 +33,6 @@
 #include <linux/swap.h>
 #include <linux/stddef.h>
 #include <linux/vmalloc.h>
-#include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/memblock.h>
 #include <linux/slab.h>
diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c
index 36e44b4260eb..c99f6510a0b2 100644
--- a/arch/powerpc/mm/tlb_hash64.c
+++ b/arch/powerpc/mm/tlb_hash64.c
@@ -23,7 +23,6 @@
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/init.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
 #include <asm/pgalloc.h>
diff --git a/arch/powerpc/oprofile/op_model_7450.c b/arch/powerpc/oprofile/op_model_7450.c
index ff617246d128..d29b6e4e5e72 100644
--- a/arch/powerpc/oprofile/op_model_7450.c
+++ b/arch/powerpc/oprofile/op_model_7450.c
@@ -16,7 +16,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <asm/ptrace.h>
 #include <asm/processor.h>
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index b9589c19ccda..1f0ebdeea5f7 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -16,7 +16,6 @@
 
 #include <linux/cpufreq.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/jiffies.h>
 #include <linux/kthread.h>
 #include <linux/oprofile.h>
diff --git a/arch/powerpc/oprofile/op_model_fsl_emb.c b/arch/powerpc/oprofile/op_model_fsl_emb.c
index 2a82d3ed464d..14cf86fdddab 100644
--- a/arch/powerpc/oprofile/op_model_fsl_emb.c
+++ b/arch/powerpc/oprofile/op_model_fsl_emb.c
@@ -14,7 +14,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <asm/ptrace.h>
 #include <asm/processor.h>
diff --git a/arch/powerpc/oprofile/op_model_pa6t.c b/arch/powerpc/oprofile/op_model_pa6t.c
index 42f778dff919..a114a7c22d40 100644
--- a/arch/powerpc/oprofile/op_model_pa6t.c
+++ b/arch/powerpc/oprofile/op_model_pa6t.c
@@ -22,7 +22,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
 #include <asm/processor.h>
diff --git a/arch/powerpc/oprofile/op_model_power4.c b/arch/powerpc/oprofile/op_model_power4.c
index f444b94935f5..962fe7b3e3fb 100644
--- a/arch/powerpc/oprofile/op_model_power4.c
+++ b/arch/powerpc/oprofile/op_model_power4.c
@@ -10,7 +10,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <asm/firmware.h>
 #include <asm/ptrace.h>
diff --git a/arch/powerpc/oprofile/op_model_rs64.c b/arch/powerpc/oprofile/op_model_rs64.c
index 9b801b8c8c5a..7e5b8ed3a1b7 100644
--- a/arch/powerpc/oprofile/op_model_rs64.c
+++ b/arch/powerpc/oprofile/op_model_rs64.c
@@ -8,7 +8,6 @@
  */
 
 #include <linux/oprofile.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 #include <asm/ptrace.h>
 #include <asm/processor.h>
diff --git a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
index fd71cfdf2380..e238b6a55b15 100644
--- a/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
+++ b/arch/powerpc/platforms/83xx/mcu_mpc8349emitx.c
@@ -11,7 +11,6 @@
  * (at your option) any later version.
  */
 
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/device.h>
diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c
index 3d9716ccd327..4b4c081df94d 100644
--- a/arch/powerpc/platforms/83xx/suspend.c
+++ b/arch/powerpc/platforms/83xx/suspend.c
@@ -10,7 +10,6 @@
  * by the Free Software Foundation.
  */
 
-#include <linux/init.h>
 #include <linux/pm.h>
 #include <linux/types.h>
 #include <linux/ioport.h>
diff --git a/arch/powerpc/platforms/85xx/sgy_cts1000.c b/arch/powerpc/platforms/85xx/sgy_cts1000.c
index b9197cea1854..bb75add67084 100644
--- a/arch/powerpc/platforms/85xx/sgy_cts1000.c
+++ b/arch/powerpc/platforms/85xx/sgy_cts1000.c
@@ -14,7 +14,6 @@
 #include <linux/platform_device.h>
 #include <linux/device.h>
 #include <linux/module.h>
-#include <linux/init.h>
 #include <linux/of_gpio.h>
 #include <linux/of_irq.h>
 #include <linux/workqueue.h>
diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c
index dead91b177b9..b6c9a0dcc924 100644
--- a/arch/powerpc/platforms/chrp/smp.c
+++ b/arch/powerpc/platforms/chrp/smp.c
@@ -14,7 +14,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/spinlock.h>
 
 #include <asm/ptrace.h>
diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
index 6c03034dbbd3..c269caee58f9 100644
--- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
+++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c
@@ -15,7 +15,6 @@
 #define pr_fmt(fmt) DRV_MODULE_NAME ": " fmt
 
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c
index f3defd8a2806..aafa01ba062f 100644
--- a/arch/powerpc/platforms/pasemi/dma_lib.c
+++ b/arch/powerpc/platforms/pasemi/dma_lib.c
@@ -18,7 +18,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/export.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
diff --git a/arch/powerpc/platforms/powermac/pfunc_core.c b/arch/powerpc/platforms/powermac/pfunc_core.c
index d588e48dff74..43075081721f 100644
--- a/arch/powerpc/platforms/powermac/pfunc_core.c
+++ b/arch/powerpc/platforms/powermac/pfunc_core.c
@@ -5,7 +5,6 @@
  * FIXME: LOCKING !!!
  */
 
-#include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 9b1db254898a..007ac4989841 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -14,7 +14,6 @@
 #include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/delay.h>
-#include <linux/init.h>
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/kernel.h>
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 1e561bef459b..2d8bf15879fd 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -25,7 +25,6 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/gfp.h>
-#include <linux/init.h>
 #include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/oom.h>
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index 5db66f1fbc26..7d61498e45c0 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -20,7 +20,6 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/debugfs.h>
 #include <linux/spinlock.h>
diff --git a/arch/powerpc/sysdev/cpm2_pic.c b/arch/powerpc/sysdev/cpm2_pic.c
index 10386b676d87..a11bd1d433ad 100644
--- a/arch/powerpc/sysdev/cpm2_pic.c
+++ b/arch/powerpc/sysdev/cpm2_pic.c
@@ -27,7 +27,6 @@
  */
 
 #include <linux/stddef.h>
-#include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/irq.h>
diff --git a/arch/powerpc/sysdev/fsl_ifc.c b/arch/powerpc/sysdev/fsl_ifc.c
index d7fc72239144..fbc885b31946 100644
--- a/arch/powerpc/sysdev/fsl_ifc.c
+++ b/arch/powerpc/sysdev/fsl_ifc.c
@@ -19,7 +19,6 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
-#include <linux/init.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/compiler.h>
diff --git a/arch/powerpc/sysdev/ge/ge_pic.h b/arch/powerpc/sysdev/ge/ge_pic.h
index 6149916da3f4..908dbd9826b6 100644
--- a/arch/powerpc/sysdev/ge/ge_pic.h
+++ b/arch/powerpc/sysdev/ge/ge_pic.h
@@ -1,7 +1,6 @@
 #ifndef __GEF_PIC_H__
 #define __GEF_PIC_H__
 
-#include <linux/init.h>
 
 void gef_pic_cascade(unsigned int, struct irq_desc *);
 unsigned int gef_pic_get_irq(void);
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index 997df6a7ab5d..45598da0b321 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -8,7 +8,6 @@
  */
 #undef DEBUG
 
-#include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c
index b724622c3a0b..c4828c0be5bd 100644
--- a/arch/powerpc/sysdev/mpc8xx_pic.c
+++ b/arch/powerpc/sysdev/mpc8xx_pic.c
@@ -1,6 +1,5 @@
 #include <linux/kernel.h>
 #include <linux/stddef.h>
-#include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/irq.h>
diff --git a/arch/powerpc/sysdev/qe_lib/qe_io.c b/arch/powerpc/sysdev/qe_lib/qe_io.c
index a88807b3dd57..d09994164daf 100644
--- a/arch/powerpc/sysdev/qe_lib/qe_io.c
+++ b/arch/powerpc/sysdev/qe_lib/qe_io.c
@@ -16,7 +16,6 @@
 
 #include <linux/stddef.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
diff --git a/arch/powerpc/sysdev/qe_lib/ucc.c b/arch/powerpc/sysdev/qe_lib/ucc.c
index 134b07d29435..621575b7e84a 100644
--- a/arch/powerpc/sysdev/qe_lib/ucc.c
+++ b/arch/powerpc/sysdev/qe_lib/ucc.c
@@ -14,7 +14,6 @@
  * option) any later version.
  */
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
diff --git a/arch/powerpc/sysdev/qe_lib/ucc_fast.c b/arch/powerpc/sysdev/qe_lib/ucc_fast.c
index cceb2e366738..65aaf15032ae 100644
--- a/arch/powerpc/sysdev/qe_lib/ucc_fast.c
+++ b/arch/powerpc/sysdev/qe_lib/ucc_fast.c
@@ -13,7 +13,6 @@
  * option) any later version.
  */
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/stddef.h>
diff --git a/arch/powerpc/sysdev/qe_lib/ucc_slow.c b/arch/powerpc/sysdev/qe_lib/ucc_slow.c
index 1c062f48f1ac..befaf1123f7f 100644
--- a/arch/powerpc/sysdev/qe_lib/ucc_slow.c
+++ b/arch/powerpc/sysdev/qe_lib/ucc_slow.c
@@ -13,7 +13,6 @@
  * option) any later version.
  */
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
 #include <linux/stddef.h>
diff --git a/arch/powerpc/sysdev/udbg_memcons.c b/arch/powerpc/sysdev/udbg_memcons.c
index ce5a7b489e4b..9998c0de12d0 100644
--- a/arch/powerpc/sysdev/udbg_memcons.c
+++ b/arch/powerpc/sysdev/udbg_memcons.c
@@ -18,7 +18,6 @@
  *      2 of the License, or (at your option) any later version.
  */
 
-#include <linux/init.h>
 #include <linux/kernel.h>
 #include <asm/barrier.h>
 #include <asm/page.h>
diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c
index df0fc5821469..c1917cf67c3d 100644
--- a/arch/powerpc/sysdev/xics/icp-hv.c
+++ b/arch/powerpc/sysdev/xics/icp-hv.c
@@ -12,7 +12,6 @@
 #include <linux/irq.h>
 #include <linux/smp.h>
 #include <linux/interrupt.h>
-#include <linux/init.h>
 #include <linux/cpu.h>
 #include <linux/of.h>
 
-- 
cgit v1.2.3


From 1d350544d5bf17d835d2850004c64ca51235c771 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Fri, 3 Jan 2014 17:47:12 +0800
Subject: powerpc/eeh: Add restore_config operation

After reset on the specific PE or PHB, we never configure AER
correctly on PowerNV platform. We needn't care it on pSeries
platform. The patch introduces additional EEH operation eeh_ops::
restore_config() so that we have chance to configure AER correctly
for PowerNV platform.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/eeh.h               | 1 +
 arch/powerpc/kernel/eeh_pe.c                 | 3 +++
 arch/powerpc/platforms/powernv/eeh-powernv.c | 3 ++-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 4 +++-
 4 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index d3e5e9bc8f94..7f8adc848cd6 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -157,6 +157,7 @@ struct eeh_ops {
 	int (*read_config)(struct device_node *dn, int where, int size, u32 *val);
 	int (*write_config)(struct device_node *dn, int where, int size, u32 val);
 	int (*next_error)(struct eeh_pe **pe);
+	int (*restore_config)(struct device_node *dn);
 };
 
 extern struct eeh_ops *eeh_ops;
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 1feccd64f955..f0c353fa655a 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -736,6 +736,9 @@ static void *eeh_restore_one_device_bars(void *data, void *flag)
 	else
 		eeh_restore_device_bars(edev, dn);
 
+	if (eeh_ops->restore_config)
+		eeh_ops->restore_config(dn);
+
 	return NULL;
 }
 
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 73b981438cc5..ab91e6a34afa 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -359,7 +359,8 @@ static struct eeh_ops powernv_eeh_ops = {
 	.configure_bridge       = powernv_eeh_configure_bridge,
 	.read_config            = pnv_pci_cfg_read,
 	.write_config           = pnv_pci_cfg_write,
-	.next_error		= powernv_eeh_next_error
+	.next_error		= powernv_eeh_next_error,
+	.restore_config		= NULL
 };
 
 /**
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index ccb633e077b1..9ef3cc8ebc11 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -689,7 +689,9 @@ static struct eeh_ops pseries_eeh_ops = {
 	.get_log		= pseries_eeh_get_log,
 	.configure_bridge       = pseries_eeh_configure_bridge,
 	.read_config		= pseries_eeh_read_config,
-	.write_config		= pseries_eeh_write_config
+	.write_config		= pseries_eeh_write_config,
+	.next_error		= NULL,
+	.restore_config		= NULL
 };
 
 /**
-- 
cgit v1.2.3


From 9be3becc2f99f4f2b1b697a616b1aa9e7889d68f Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Fri, 3 Jan 2014 17:47:13 +0800
Subject: powerpc/eeh: Call opal_pci_reinit() on powernv for restoring config
 space

The patch implements the EEH operation backend restore_config()
for PowerNV platform. That relies on OPAL API opal_pci_reinit()
where we reinitialize the error reporting properly after PE or
PHB reset.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/opal.h              |  8 ++++++--
 arch/powerpc/platforms/powernv/eeh-powernv.c | 23 ++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index a4041e9ed550..9a87b4401a41 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -344,12 +344,16 @@ enum OpalMveEnableAction {
 	OPAL_ENABLE_MVE = 1
 };
 
-enum OpalPciResetAndReinitScope {
+enum OpalPciResetScope {
 	OPAL_PHB_COMPLETE = 1, OPAL_PCI_LINK = 2, OPAL_PHB_ERROR = 3,
 	OPAL_PCI_HOT_RESET = 4, OPAL_PCI_FUNDAMENTAL_RESET = 5,
 	OPAL_PCI_IODA_TABLE_RESET = 6,
 };
 
+enum OpalPciReinitScope {
+	OPAL_REINIT_PCI_DEV = 1000
+};
+
 enum OpalPciResetState {
 	OPAL_DEASSERT_RESET = 0,
 	OPAL_ASSERT_RESET = 1
@@ -801,7 +805,7 @@ int64_t opal_pci_get_phb_diag_data(uint64_t phb_id, void *diag_buffer,
 int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id, void *diag_buffer,
 				    uint64_t diag_buffer_len);
 int64_t opal_pci_fence_phb(uint64_t phb_id);
-int64_t opal_pci_reinit(uint64_t phb_id, uint8_t reinit_scope);
+int64_t opal_pci_reinit(uint64_t phb_id, uint64_t reinit_scope, uint64_t data);
 int64_t opal_pci_mask_pe_error(uint64_t phb_id, uint16_t pe_number, uint8_t error_type, uint8_t mask_action);
 int64_t opal_set_slot_led_status(uint64_t phb_id, uint64_t slot_id, uint8_t led_type, uint8_t led_action);
 int64_t opal_get_epow_status(__be64 *status);
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index ab91e6a34afa..a79fddc5e74e 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -344,6 +344,27 @@ static int powernv_eeh_next_error(struct eeh_pe **pe)
 	return -EEXIST;
 }
 
+static int powernv_eeh_restore_config(struct device_node *dn)
+{
+	struct eeh_dev *edev = of_node_to_eeh_dev(dn);
+	struct pnv_phb *phb;
+	s64 ret;
+
+	if (!edev)
+		return -EEXIST;
+
+	phb = edev->phb->private_data;
+	ret = opal_pci_reinit(phb->opal_id,
+			      OPAL_REINIT_PCI_DEV, edev->config_addr);
+	if (ret) {
+		pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
+			__func__, edev->config_addr, ret);
+		return -EIO;
+	}
+
+	return 0;
+}
+
 static struct eeh_ops powernv_eeh_ops = {
 	.name                   = "powernv",
 	.init                   = powernv_eeh_init,
@@ -360,7 +381,7 @@ static struct eeh_ops powernv_eeh_ops = {
 	.read_config            = pnv_pci_cfg_read,
 	.write_config           = pnv_pci_cfg_write,
 	.next_error		= powernv_eeh_next_error,
-	.restore_config		= NULL
+	.restore_config		= powernv_eeh_restore_config
 };
 
 /**
-- 
cgit v1.2.3


From f26c7a035b7f2f1a7505ce42e4ba946b12f7df91 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Sun, 12 Jan 2014 14:13:45 +0800
Subject: powerpc/eeh: Hotplug improvement

When EEH error comes to one specific PCI device before its driver
is loaded, we will apply hotplug to recover the error. During the
plug time, the PCI device will be probed and its driver is loaded.
Then we wrongly calls to the error handlers if the driver supports
EEH explicitly.

The patch intends to fix by introducing flag EEH_DEV_NO_HANDLER and
set it before we remove the PCI device. In turn, we can avoid wrongly
calls the error handlers of the PCI device after its driver loaded.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/eeh.h   |  3 ++-
 arch/powerpc/kernel/eeh.c        | 15 +++++++++++++++
 arch/powerpc/kernel/eeh_driver.c | 10 +++++++---
 3 files changed, 24 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7f8adc848cd6..8b4b8e4a5c32 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -90,7 +90,8 @@ struct eeh_pe {
 #define EEH_DEV_IRQ_DISABLED	(1 << 3)	/* Interrupt disabled	*/
 #define EEH_DEV_DISCONNECTED	(1 << 4)	/* Removing from PE	*/
 
-#define EEH_DEV_SYSFS		(1 << 8)	/* Sysfs created        */
+#define EEH_DEV_NO_HANDLER	(1 << 8)	/* No error handler	*/
+#define EEH_DEV_SYSFS		(1 << 9)	/* Sysfs created	*/
 
 struct eeh_dev {
 	int mode;			/* EEH mode			*/
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index f4b7a227f183..148db72a8c43 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -921,6 +921,13 @@ void eeh_add_device_late(struct pci_dev *dev)
 		eeh_sysfs_remove_device(edev->pdev);
 		edev->mode &= ~EEH_DEV_SYSFS;
 
+		/*
+		 * We definitely should have the PCI device removed
+		 * though it wasn't correctly. So we needn't call
+		 * into error handler afterwards.
+		 */
+		edev->mode |= EEH_DEV_NO_HANDLER;
+
 		edev->pdev = NULL;
 		dev->dev.archdata.edev = NULL;
 	}
@@ -1023,6 +1030,14 @@ void eeh_remove_device(struct pci_dev *dev)
 	else
 		edev->mode |= EEH_DEV_DISCONNECTED;
 
+	/*
+	 * We're removing from the PCI subsystem, that means
+	 * the PCI device driver can't support EEH or not
+	 * well. So we rely on hotplug completely to do recovery
+	 * for the specific PCI device.
+	 */
+	edev->mode |= EEH_DEV_NO_HANDLER;
+
 	eeh_addr_cache_rmv_dev(dev);
 	eeh_sysfs_remove_device(dev);
 	edev->mode &= ~EEH_DEV_SYSFS;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 4ef59c33777f..7db39203a073 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -217,7 +217,8 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
 	if (!driver) return NULL;
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->mmio_enabled) {
+	    !driver->err_handler->mmio_enabled ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
 		eeh_pcid_put(dev);
 		return NULL;
 	}
@@ -258,7 +259,8 @@ static void *eeh_report_reset(void *data, void *userdata)
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->slot_reset) {
+	    !driver->err_handler->slot_reset ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
 		eeh_pcid_put(dev);
 		return NULL;
 	}
@@ -297,7 +299,9 @@ static void *eeh_report_resume(void *data, void *userdata)
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
-	    !driver->err_handler->resume) {
+	    !driver->err_handler->resume ||
+	    (edev->mode & EEH_DEV_NO_HANDLER)) {
+		edev->mode &= ~EEH_DEV_NO_HANDLER;
 		eeh_pcid_put(dev);
 		return NULL;
 	}
-- 
cgit v1.2.3


From d4edc5b6c480a0917e61d93d55531d7efa6230be Mon Sep 17 00:00:00 2001
From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
Date: Mon, 30 Dec 2013 17:05:34 +0530
Subject: powerpc: Fix the setup of CPU-to-Node mappings during CPU online

On POWER platforms, the hypervisor can notify the guest kernel about dynamic
changes in the cpu-numa associativity (VPHN topology update). Hence the
cpu-to-node mappings that we got from the firmware during boot, may no longer
be valid after such updates. This is handled using the arch_update_cpu_topology()
hook in the scheduler, and the sched-domains are rebuilt according to the new
mappings.

But unfortunately, at the moment, CPU hotplug ignores these updated mappings
and instead queries the firmware for the cpu-to-numa relationships and uses
them during CPU online. So the kernel can end up assigning wrong NUMA nodes
to CPUs during subsequent CPU hotplug online operations (after booting).

Further, a particularly problematic scenario can result from this bug:
On POWER platforms, the SMT mode can be switched between 1, 2, 4 (and even 8)
threads per core. The switch to Single-Threaded (ST) mode is performed by
offlining all except the first CPU thread in each core. Switching back to
SMT mode involves onlining those other threads back, in each core.

Now consider this scenario:

1. During boot, the kernel gets the cpu-to-node mappings from the firmware
   and assigns the CPUs to NUMA nodes appropriately, during CPU online.

2. Later on, the hypervisor updates the cpu-to-node mappings dynamically and
   communicates this update to the kernel. The kernel in turn updates its
   cpu-to-node associations and rebuilds its sched domains. Everything is
   fine so far.

3. Now, the user switches the machine from SMT to ST mode (say, by running
   ppc64_cpu --smt=1). This involves offlining all except 1 thread in each
   core.

4. The user then tries to switch back from ST to SMT mode (say, by running
   ppc64_cpu --smt=4), and this involves onlining those threads back. Since
   CPU hotplug ignores the new mappings, it queries the firmware and tries to
   associate the newly onlined sibling threads to the old NUMA nodes. This
   results in sibling threads within the same core getting associated with
   different NUMA nodes, which is incorrect.

   The scheduler's build-sched-domains code gets thoroughly confused with this
   and enters an infinite loop and causes soft-lockups, as explained in detail
   in commit 3be7db6ab (powerpc: VPHN topology change updates all siblings).

So to fix this, use the numa_cpu_lookup_table to remember the updated
cpu-to-node mappings, and use them during CPU hotplug online operations.
Further, we also need to ensure that all threads in a core are assigned to a
common NUMA node, irrespective of whether all those threads were online during
the topology update. To achieve this, we take care not to use cpu_sibling_mask()
since it is not hotplug invariant. Instead, we use cpu_first_sibling_thread()
and set up the mappings manually using the 'threads_per_core' value for that
particular platform. This helps us ensure that we don't hit this bug with any
combination of CPU hotplug and SMT mode switching.

Cc: stable@vger.kernel.org
Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/topology.h | 10 +++++-
 arch/powerpc/mm/numa.c              | 70 +++++++++++++++++++++++++++++++++++--
 2 files changed, 76 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 89e3ef2496ac..d0b5fca6b077 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -22,7 +22,15 @@ struct device_node;
 
 static inline int cpu_to_node(int cpu)
 {
-	return numa_cpu_lookup_table[cpu];
+	int nid;
+
+	nid = numa_cpu_lookup_table[cpu];
+
+	/*
+	 * During early boot, the numa-cpu lookup table might not have been
+	 * setup for all CPUs yet. In such cases, default to node 0.
+	 */
+	return (nid < 0) ? 0 : nid;
 }
 
 #define parent_node(node)	(node)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 078d3e00a616..6847d509162f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -31,6 +31,8 @@
 #include <asm/sparsemem.h>
 #include <asm/prom.h>
 #include <asm/smp.h>
+#include <asm/cputhreads.h>
+#include <asm/topology.h>
 #include <asm/firmware.h>
 #include <asm/paca.h>
 #include <asm/hvcall.h>
@@ -152,9 +154,22 @@ static void __init get_node_active_region(unsigned long pfn,
 	}
 }
 
-static void map_cpu_to_node(int cpu, int node)
+static void reset_numa_cpu_lookup_table(void)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		numa_cpu_lookup_table[cpu] = -1;
+}
+
+static void update_numa_cpu_lookup_table(unsigned int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
+}
+
+static void map_cpu_to_node(int cpu, int node)
+{
+	update_numa_cpu_lookup_table(cpu, node);
 
 	dbg("adding cpu %d to node %d\n", cpu, node);
 
@@ -522,11 +537,24 @@ static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
  */
 static int numa_setup_cpu(unsigned long lcpu)
 {
-	int nid = 0;
-	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);
+	int nid;
+	struct device_node *cpu;
+
+	/*
+	 * If a valid cpu-to-node mapping is already available, use it
+	 * directly instead of querying the firmware, since it represents
+	 * the most recent mapping notified to us by the platform (eg: VPHN).
+	 */
+	if ((nid = numa_cpu_lookup_table[lcpu]) >= 0) {
+		map_cpu_to_node(lcpu, nid);
+		return nid;
+	}
+
+	cpu = of_get_cpu_node(lcpu, NULL);
 
 	if (!cpu) {
 		WARN_ON(1);
+		nid = 0;
 		goto out;
 	}
 
@@ -1067,6 +1095,7 @@ void __init do_init_bootmem(void)
 	 */
 	setup_node_to_cpumask_map();
 
+	reset_numa_cpu_lookup_table();
 	register_cpu_notifier(&ppc64_numa_nb);
 	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
 			  (void *)(unsigned long)boot_cpuid);
@@ -1445,6 +1474,33 @@ static int update_cpu_topology(void *data)
 	return 0;
 }
 
+static int update_lookup_table(void *data)
+{
+	struct topology_update_data *update;
+
+	if (!data)
+		return -EINVAL;
+
+	/*
+	 * Upon topology update, the numa-cpu lookup table needs to be updated
+	 * for all threads in the core, including offline CPUs, to ensure that
+	 * future hotplug operations respect the cpu-to-node associativity
+	 * properly.
+	 */
+	for (update = data; update; update = update->next) {
+		int nid, base, j;
+
+		nid = update->new_nid;
+		base = cpu_first_thread_sibling(update->cpu);
+
+		for (j = 0; j < threads_per_core; j++) {
+			update_numa_cpu_lookup_table(base + j, nid);
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Update the node maps and sysfs entries for each cpu whose home node
  * has changed. Returns 1 when the topology has changed, and 0 otherwise.
@@ -1513,6 +1569,14 @@ int arch_update_cpu_topology(void)
 
 	stop_machine(update_cpu_topology, &updates[0], &updated_cpus);
 
+	/*
+	 * Update the numa-cpu lookup table with the new mappings, even for
+	 * offline CPUs. It is best to perform this update from the stop-
+	 * machine context.
+	 */
+	stop_machine(update_lookup_table, &updates[0],
+					cpumask_of(raw_smp_processor_id()));
+
 	for (ud = &updates[0]; ud; ud = ud->next) {
 		unregister_cpu_under_node(ud->cpu, ud->old_nid);
 		register_cpu_under_node(ud->cpu, ud->new_nid);
-- 
cgit v1.2.3


From 30c826358d10c1d6f8147de3310b97488daec830 Mon Sep 17 00:00:00 2001
From: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Date: Tue, 14 Jan 2014 15:45:09 +0530
Subject: Move precessing of MCE queued event out from syscall exit path.

Huge Dickins reported an issue that b5ff4211a829
"powerpc/book3s: Queue up and process delayed MCE events" breaks the
PowerMac G5 boot. This patch fixes it by moving the mce even processing
away from syscall exit, which was wrong to do that in first place, and
using irq work framework to delay processing of mce event.

Reported-by: Hugh Dickins <hughd@google.com
Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/mce.h |  1 -
 arch/powerpc/kernel/entry_64.S |  5 -----
 arch/powerpc/kernel/mce.c      | 13 ++++++++++---
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a2b8c7b35fba..8e99edf6d966 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -191,7 +191,6 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
 extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
-extern void machine_check_process_queued_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt);
 extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
 
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 770d6d65c47b..bbfb0294b354 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -183,11 +183,6 @@ syscall_exit:
 #ifdef SHOW_SYSCALLS
 	bl	.do_show_syscall_exit
 	ld	r3,RESULT(r1)
-#endif
-#ifdef CONFIG_PPC_BOOK3S_64
-BEGIN_FTR_SECTION
-	bl	.machine_check_process_queued_event
-END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 #endif
 	CURRENT_THREAD_INFO(r12, r1)
 
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index c0c52ec1fca7..cadef7e64e42 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -26,6 +26,7 @@
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
 #include <linux/export.h>
+#include <linux/irq_work.h>
 #include <asm/mce.h>
 
 static DEFINE_PER_CPU(int, mce_nest_count);
@@ -35,6 +36,11 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
 static DEFINE_PER_CPU(int, mce_queue_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
 
+static void machine_check_process_queued_event(struct irq_work *work);
+struct irq_work mce_event_process_work = {
+        .func = machine_check_process_queued_event,
+};
+
 static void mce_set_error_info(struct machine_check_event *mce,
 			       struct mce_error_info *mce_err)
 {
@@ -185,17 +191,19 @@ void machine_check_queue_event(void)
 		return;
 	}
 	__get_cpu_var(mce_event_queue[index]) = evt;
+
+	/* Queue irq work to process this event later. */
+	irq_work_queue(&mce_event_process_work);
 }
 
 /*
  * process pending MCE event from the mce event queue. This function will be
  * called during syscall exit.
  */
-void machine_check_process_queued_event(void)
+static void machine_check_process_queued_event(struct irq_work *work)
 {
 	int index;
 
-	preempt_disable();
 	/*
 	 * For now just print it to console.
 	 * TODO: log this error event to FSP or nvram.
@@ -206,7 +214,6 @@ void machine_check_process_queued_event(void)
 				&__get_cpu_var(mce_event_queue[index]));
 		__get_cpu_var(mce_queue_count)--;
 	}
-	preempt_enable();
 }
 
 void machine_check_print_event_info(struct machine_check_event *evt)
-- 
cgit v1.2.3


From ae39c58c2e56209ae3503286ffe2338d5e9a4bed Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 13 Jan 2014 15:56:28 +1100
Subject: powerpc: Reclaim two unused thread_info flag bits

TIF_PERFMON_WORK and TIF_PERFMON_CTXSW are completely unused.  They
appear to be related to the old perfmon2 code, which has been
superseded by the perf_event infrastructure.  This removes their
definitions so that the bits can be used for other purposes.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/thread_info.h | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 9854c564ac52..fc2bf414c584 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -91,8 +91,6 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_POLLING_NRFLAG	3	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
 #define TIF_32BIT		4	/* 32 bit binary */
-#define TIF_PERFMON_WORK	5	/* work for pfm_handle_work() */
-#define TIF_PERFMON_CTXSW	6	/* perfmon needs ctxsw calls */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SINGLESTEP		8	/* singlestepping active */
 #define TIF_NOHZ		9	/* in adaptive nohz mode */
@@ -115,8 +113,6 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_32BIT		(1<<TIF_32BIT)
-#define _TIF_PERFMON_WORK	(1<<TIF_PERFMON_WORK)
-#define _TIF_PERFMON_CTXSW	(1<<TIF_PERFMON_CTXSW)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
-- 
cgit v1.2.3


From d31626f70b6103f4d9153b75d07e0e8795728cc9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 13 Jan 2014 15:56:29 +1100
Subject: powerpc: Don't corrupt transactional state when using FP/VMX in
 kernel

Currently, when we have a process using the transactional memory
facilities on POWER8 (that is, the processor is in transactional
or suspended state), and the process enters the kernel and the
kernel then uses the floating-point or vector (VMX/Altivec) facility,
we end up corrupting the user-visible FP/VMX/VSX state.  This
happens, for example, if a page fault causes a copy-on-write
operation, because the copy_page function will use VMX to do the
copy on POWER8.  The test program below demonstrates the bug.

The bug happens because when FP/VMX state for a transactional process
is stored in the thread_struct, we store the checkpointed state in
.fp_state/.vr_state and the transactional (current) state in
.transact_fp/.transact_vr.  However, when the kernel wants to use
FP/VMX, it calls enable_kernel_fp() or enable_kernel_altivec(),
which saves the current state in .fp_state/.vr_state.  Furthermore,
when we return to the user process we return with FP/VMX/VSX
disabled.  The next time the process uses FP/VMX/VSX, we don't know
which set of state (the current register values, .fp_state/.vr_state,
or .transact_fp/.transact_vr) we should be using, since we have no
way to tell if we are still in the same transaction, and if not,
whether the previous transaction succeeded or failed.

Thus it is necessary to strictly adhere to the rule that if FP has
been enabled at any point in a transaction, we must keep FP enabled
for the user process with the current transactional state in the
FP registers, until we detect that it is no longer in a transaction.
Similarly for VMX; once enabled it must stay enabled until the
process is no longer transactional.

In order to keep this rule, we add a new thread_info flag which we
test when returning from the kernel to userspace, called TIF_RESTORE_TM.
This flag indicates that there is FP/VMX/VSX state to be restored
before entering userspace, and when it is set the .tm_orig_msr field
in the thread_struct indicates what state needs to be restored.
The restoration is done by restore_tm_state().  The TIF_RESTORE_TM
bit is set by new giveup_fpu/altivec_maybe_transactional helpers,
which are called from enable_kernel_fp/altivec, giveup_vsx, and
flush_fp/altivec_to_thread instead of giveup_fpu/altivec.

The other thing to be done is to get the transactional FP/VMX/VSX
state from .fp_state/.vr_state when doing reclaim, if that state
has been saved there by giveup_fpu/altivec_maybe_transactional.
Having done this, we set the FP/VMX bit in the thread's MSR after
reclaim to indicate that that part of the state is now valid
(having been reclaimed from the processor's checkpointed state).

Finally, in the signal handling code, we move the clearing of the
transactional state bits in the thread's MSR a bit earlier, before
calling flush_fp_to_thread(), so that we don't unnecessarily set
the TIF_RESTORE_TM bit.

This is the test program:

/* Michael Neuling 4/12/2013
 *
 * See if the altivec state is leaked out of an aborted transaction due to
 * kernel vmx copy loops.
 *
 *   gcc -m64 htm_vmxcopy.c -o htm_vmxcopy
 *
 */

/* We don't use all of these, but for reference: */

int main(int argc, char *argv[])
{
	long double vecin = 1.3;
	long double vecout;
	unsigned long pgsize = getpagesize();
	int i;
	int fd;
	int size = pgsize*16;
	char tmpfile[] = "/tmp/page_faultXXXXXX";
	char buf[pgsize];
	char *a;
	uint64_t aborted = 0;

	fd = mkstemp(tmpfile);
	assert(fd >= 0);

	memset(buf, 0, pgsize);
	for (i = 0; i < size; i += pgsize)
		assert(write(fd, buf, pgsize) == pgsize);

	unlink(tmpfile);

	a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
	assert(a != MAP_FAILED);

	asm __volatile__(
		"lxvd2x 40,0,%[vecinptr] ; " // set 40 to initial value
		TBEGIN
		"beq	3f ;"
		TSUSPEND
		"xxlxor 40,40,40 ; " // set 40 to 0
		"std	5, 0(%[map]) ;" // cause kernel vmx copy page
		TABORT
		TRESUME
		TEND
		"li	%[res], 0 ;"
		"b	5f ;"
		"3: ;" // Abort handler
		"li	%[res], 1 ;"
		"5: ;"
		"stxvd2x 40,0,%[vecoutptr] ; "
		: [res]"=r"(aborted)
		: [vecinptr]"r"(&vecin),
		  [vecoutptr]"r"(&vecout),
		  [map]"r"(a)
		: "memory", "r0", "r3", "r4", "r5", "r6", "r7");

	if (aborted && (vecin != vecout)){
		printf("FAILED: vector state leaked on abort %f != %f\n",
		       (double)vecin, (double)vecout);
		exit(1);
	}

	munmap(a, size);

	close(fd);

	printf("PASSED!\n");
	return 0;
}

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/processor.h   |   2 +
 arch/powerpc/include/asm/thread_info.h |   5 +-
 arch/powerpc/include/asm/tm.h          |   1 +
 arch/powerpc/kernel/entry_64.S         |  12 ++-
 arch/powerpc/kernel/fpu.S              |  16 ++++
 arch/powerpc/kernel/process.c          | 146 ++++++++++++++++++++++++++++++---
 arch/powerpc/kernel/signal.c           |   3 +-
 arch/powerpc/kernel/signal_32.c        |  21 ++---
 arch/powerpc/kernel/signal_64.c        |  14 ++--
 arch/powerpc/kernel/traps.c            |  12 +--
 arch/powerpc/kernel/vector.S           |  10 +++
 11 files changed, 195 insertions(+), 47 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index fc14a38c7ccf..232a2fa5b483 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -373,6 +373,8 @@ extern int set_endian(struct task_struct *tsk, unsigned int val);
 extern int get_unalign_ctl(struct task_struct *tsk, unsigned long adr);
 extern int set_unalign_ctl(struct task_struct *tsk, unsigned int val);
 
+extern void fp_enable(void);
+extern void vec_enable(void);
 extern void load_fp_state(struct thread_fp_state *fp);
 extern void store_fp_state(struct thread_fp_state *fp);
 extern void load_vr_state(struct thread_vr_state *vr);
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index fc2bf414c584..b034ecdb7c74 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -91,6 +91,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_POLLING_NRFLAG	3	/* true if poll_idle() is polling
 					   TIF_NEED_RESCHED */
 #define TIF_32BIT		4	/* 32 bit binary */
+#define TIF_RESTORE_TM		5	/* need to restore TM FP/VEC/VSX */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SINGLESTEP		8	/* singlestepping active */
 #define TIF_NOHZ		9	/* in adaptive nohz mode */
@@ -113,6 +114,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_32BIT		(1<<TIF_32BIT)
+#define _TIF_RESTORE_TM		(1<<TIF_RESTORE_TM)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
@@ -128,7 +130,8 @@ static inline struct thread_info *current_thread_info(void)
 				 _TIF_NOHZ)
 
 #define _TIF_USER_WORK_MASK	(_TIF_SIGPENDING | _TIF_NEED_RESCHED | \
-				 _TIF_NOTIFY_RESUME | _TIF_UPROBE)
+				 _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+				 _TIF_RESTORE_TM)
 #define _TIF_PERSYSCALL_MASK	(_TIF_RESTOREALL|_TIF_NOERROR)
 
 /* Bits in local_flags */
diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h
index 9dfbc34bdbf5..0c9f8b74dd97 100644
--- a/arch/powerpc/include/asm/tm.h
+++ b/arch/powerpc/include/asm/tm.h
@@ -15,6 +15,7 @@ extern void do_load_up_transact_altivec(struct thread_struct *thread);
 extern void tm_enable(void);
 extern void tm_reclaim(struct thread_struct *thread,
 		       unsigned long orig_msr, uint8_t cause);
+extern void tm_reclaim_current(uint8_t cause);
 extern void tm_recheckpoint(struct thread_struct *thread,
 			    unsigned long orig_msr);
 extern void tm_abort(uint8_t cause);
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index bbfb0294b354..662c6dd98072 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -664,8 +664,16 @@ _GLOBAL(ret_from_except_lite)
 	bl	.restore_interrupts
 	SCHEDULE_USER
 	b	.ret_from_except_lite
-
-2:	bl	.save_nvgprs
+2:
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	andi.	r0,r4,_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM
+	bne	3f		/* only restore TM if nothing else to do */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.restore_tm_state
+	b	restore
+3:
+#endif
+	bl	.save_nvgprs
 	bl	.restore_interrupts
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	.do_notify_resume
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index f7f5b8bed68f..9ad236e5d2c9 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -80,6 +80,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	blr
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
+/*
+ * Enable use of the FPU, and VSX if possible, for the caller.
+ */
+_GLOBAL(fp_enable)
+	mfmsr	r3
+	ori	r3,r3,MSR_FP
+#ifdef CONFIG_VSX
+BEGIN_FTR_SECTION
+	oris	r3,r3,MSR_VSX@h
+END_FTR_SECTION_IFSET(CPU_FTR_VSX)
+#endif
+	SYNC
+	MTMSRD(r3)
+	isync			/* (not necessary for arch 2.02 and later) */
+	blr
+
 /*
  * Load state from memory into FP registers including FPSCR.
  * Assumes the caller has enabled FP in the MSR.
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index bf8e136316e2..3573d186505f 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -73,6 +73,48 @@ struct task_struct *last_task_used_vsx = NULL;
 struct task_struct *last_task_used_spe = NULL;
 #endif
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+void giveup_fpu_maybe_transactional(struct task_struct *tsk)
+{
+	/*
+	 * If we are saving the current thread's registers, and the
+	 * thread is in a transactional state, set the TIF_RESTORE_TM
+	 * bit so that we know to restore the registers before
+	 * returning to userspace.
+	 */
+	if (tsk == current && tsk->thread.regs &&
+	    MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
+	    !test_thread_flag(TIF_RESTORE_TM)) {
+		tsk->thread.tm_orig_msr = tsk->thread.regs->msr;
+		set_thread_flag(TIF_RESTORE_TM);
+	}
+
+	giveup_fpu(tsk);
+}
+
+void giveup_altivec_maybe_transactional(struct task_struct *tsk)
+{
+	/*
+	 * If we are saving the current thread's registers, and the
+	 * thread is in a transactional state, set the TIF_RESTORE_TM
+	 * bit so that we know to restore the registers before
+	 * returning to userspace.
+	 */
+	if (tsk == current && tsk->thread.regs &&
+	    MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
+	    !test_thread_flag(TIF_RESTORE_TM)) {
+		tsk->thread.tm_orig_msr = tsk->thread.regs->msr;
+		set_thread_flag(TIF_RESTORE_TM);
+	}
+
+	giveup_altivec(tsk);
+}
+
+#else
+#define giveup_fpu_maybe_transactional(tsk)	giveup_fpu(tsk)
+#define giveup_altivec_maybe_transactional(tsk)	giveup_altivec(tsk)
+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
 #ifdef CONFIG_PPC_FPU
 /*
  * Make sure the floating-point register state in the
@@ -101,13 +143,13 @@ void flush_fp_to_thread(struct task_struct *tsk)
 			 */
 			BUG_ON(tsk != current);
 #endif
-			giveup_fpu(tsk);
+			giveup_fpu_maybe_transactional(tsk);
 		}
 		preempt_enable();
 	}
 }
 EXPORT_SYMBOL_GPL(flush_fp_to_thread);
-#endif
+#endif /* CONFIG_PPC_FPU */
 
 void enable_kernel_fp(void)
 {
@@ -115,11 +157,11 @@ void enable_kernel_fp(void)
 
 #ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_FP))
-		giveup_fpu(current);
+		giveup_fpu_maybe_transactional(current);
 	else
 		giveup_fpu(NULL);	/* just enables FP for kernel */
 #else
-	giveup_fpu(last_task_used_math);
+	giveup_fpu_maybe_transactional(last_task_used_math);
 #endif /* CONFIG_SMP */
 }
 EXPORT_SYMBOL(enable_kernel_fp);
@@ -131,11 +173,11 @@ void enable_kernel_altivec(void)
 
 #ifdef CONFIG_SMP
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC))
-		giveup_altivec(current);
+		giveup_altivec_maybe_transactional(current);
 	else
 		giveup_altivec_notask();
 #else
-	giveup_altivec(last_task_used_altivec);
+	giveup_altivec_maybe_transactional(last_task_used_altivec);
 #endif /* CONFIG_SMP */
 }
 EXPORT_SYMBOL(enable_kernel_altivec);
@@ -152,7 +194,7 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 #ifdef CONFIG_SMP
 			BUG_ON(tsk != current);
 #endif
-			giveup_altivec(tsk);
+			giveup_altivec_maybe_transactional(tsk);
 		}
 		preempt_enable();
 	}
@@ -181,8 +223,8 @@ EXPORT_SYMBOL(enable_kernel_vsx);
 
 void giveup_vsx(struct task_struct *tsk)
 {
-	giveup_fpu(tsk);
-	giveup_altivec(tsk);
+	giveup_fpu_maybe_transactional(tsk);
+	giveup_altivec_maybe_transactional(tsk);
 	__giveup_vsx(tsk);
 }
 
@@ -478,7 +520,48 @@ static inline bool hw_brk_match(struct arch_hw_breakpoint *a,
 		return false;
 	return true;
 }
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static void tm_reclaim_thread(struct thread_struct *thr,
+			      struct thread_info *ti, uint8_t cause)
+{
+	unsigned long msr_diff = 0;
+
+	/*
+	 * If FP/VSX registers have been already saved to the
+	 * thread_struct, move them to the transact_fp array.
+	 * We clear the TIF_RESTORE_TM bit since after the reclaim
+	 * the thread will no longer be transactional.
+	 */
+	if (test_ti_thread_flag(ti, TIF_RESTORE_TM)) {
+		msr_diff = thr->tm_orig_msr & ~thr->regs->msr;
+		if (msr_diff & MSR_FP)
+			memcpy(&thr->transact_fp, &thr->fp_state,
+			       sizeof(struct thread_fp_state));
+		if (msr_diff & MSR_VEC)
+			memcpy(&thr->transact_vr, &thr->vr_state,
+			       sizeof(struct thread_vr_state));
+		clear_ti_thread_flag(ti, TIF_RESTORE_TM);
+		msr_diff &= MSR_FP | MSR_VEC | MSR_VSX | MSR_FE0 | MSR_FE1;
+	}
+
+	tm_reclaim(thr, thr->regs->msr, cause);
+
+	/* Having done the reclaim, we now have the checkpointed
+	 * FP/VSX values in the registers.  These might be valid
+	 * even if we have previously called enable_kernel_fp() or
+	 * flush_fp_to_thread(), so update thr->regs->msr to
+	 * indicate their current validity.
+	 */
+	thr->regs->msr |= msr_diff;
+}
+
+void tm_reclaim_current(uint8_t cause)
+{
+	tm_enable();
+	tm_reclaim_thread(&current->thread, current_thread_info(), cause);
+}
+
 static inline void tm_reclaim_task(struct task_struct *tsk)
 {
 	/* We have to work out if we're switching from/to a task that's in the
@@ -501,9 +584,11 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 
 	/* Stash the original thread MSR, as giveup_fpu et al will
 	 * modify it.  We hold onto it to see whether the task used
-	 * FP & vector regs.
+	 * FP & vector regs.  If the TIF_RESTORE_TM flag is set,
+	 * tm_orig_msr is already set.
 	 */
-	thr->tm_orig_msr = thr->regs->msr;
+	if (!test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_TM))
+		thr->tm_orig_msr = thr->regs->msr;
 
 	TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
 		 "ccr=%lx, msr=%lx, trap=%lx)\n",
@@ -511,7 +596,7 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 		 thr->regs->ccr, thr->regs->msr,
 		 thr->regs->trap);
 
-	tm_reclaim(thr, thr->regs->msr, TM_CAUSE_RESCHED);
+	tm_reclaim_thread(thr, task_thread_info(tsk), TM_CAUSE_RESCHED);
 
 	TM_DEBUG("--- tm_reclaim on pid %d complete\n",
 		 tsk->pid);
@@ -587,6 +672,43 @@ static inline void __switch_to_tm(struct task_struct *prev)
 		tm_reclaim_task(prev);
 	}
 }
+
+/*
+ * This is called if we are on the way out to userspace and the
+ * TIF_RESTORE_TM flag is set.  It checks if we need to reload
+ * FP and/or vector state and does so if necessary.
+ * If userspace is inside a transaction (whether active or
+ * suspended) and FP/VMX/VSX instructions have ever been enabled
+ * inside that transaction, then we have to keep them enabled
+ * and keep the FP/VMX/VSX state loaded while ever the transaction
+ * continues.  The reason is that if we didn't, and subsequently
+ * got a FP/VMX/VSX unavailable interrupt inside a transaction,
+ * we don't know whether it's the same transaction, and thus we
+ * don't know which of the checkpointed state and the transactional
+ * state to use.
+ */
+void restore_tm_state(struct pt_regs *regs)
+{
+	unsigned long msr_diff;
+
+	clear_thread_flag(TIF_RESTORE_TM);
+	if (!MSR_TM_ACTIVE(regs->msr))
+		return;
+
+	msr_diff = current->thread.tm_orig_msr & ~regs->msr;
+	msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
+	if (msr_diff & MSR_FP) {
+		fp_enable();
+		load_fp_state(&current->thread.fp_state);
+		regs->msr |= current->thread.fpexc_mode;
+	}
+	if (msr_diff & MSR_VEC) {
+		vec_enable();
+		load_vr_state(&current->thread.vr_state);
+	}
+	regs->msr |= msr_diff;
+}
+
 #else
 #define tm_recheckpoint_new_task(new)
 #define __switch_to_tm(prev)
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index 457e97aa2945..8fc4177ed65a 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -203,8 +203,7 @@ unsigned long get_tm_stackpointer(struct pt_regs *regs)
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	if (MSR_TM_ACTIVE(regs->msr)) {
-		tm_enable();
-		tm_reclaim(&current->thread, regs->msr, TM_CAUSE_SIGNAL);
+		tm_reclaim_current(TM_CAUSE_SIGNAL);
 		if (MSR_TM_TRANSACTIONAL(regs->msr))
 			return current->thread.ckpt_regs.gpr[1];
 	}
diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
index 68027bfa5f8e..6ce69e6f1fcb 100644
--- a/arch/powerpc/kernel/signal_32.c
+++ b/arch/powerpc/kernel/signal_32.c
@@ -519,6 +519,13 @@ static int save_tm_user_regs(struct pt_regs *regs,
 {
 	unsigned long msr = regs->msr;
 
+	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
+	 * just indicates to userland that we were doing a transaction, but we
+	 * don't want to return in transactional state.  This also ensures
+	 * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
+	 */
+	regs->msr &= ~MSR_TS_MASK;
+
 	/* Make sure floating point registers are stored in regs */
 	flush_fp_to_thread(current);
 
@@ -1056,13 +1063,6 @@ int handle_rt_signal32(unsigned long sig, struct k_sigaction *ka,
 	/* enter the signal handler in native-endian mode */
 	regs->msr &= ~MSR_LE;
 	regs->msr |= (MSR_KERNEL & MSR_LE);
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state:
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-#endif
 	return 1;
 
 badframe:
@@ -1484,13 +1484,6 @@ int handle_signal32(unsigned long sig, struct k_sigaction *ka,
 	regs->nip = (unsigned long) ka->sa.sa_handler;
 	/* enter the signal handler in big-endian mode */
 	regs->msr &= ~MSR_LE;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state:
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-#endif
 	return 1;
 
 badframe:
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index 42991045349f..e35bf773df7a 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -192,6 +192,13 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 
 	BUG_ON(!MSR_TM_ACTIVE(regs->msr));
 
+	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
+	 * just indicates to userland that we were doing a transaction, but we
+	 * don't want to return in transactional state.  This also ensures
+	 * that flush_fp_to_thread won't set TIF_RESTORE_TM again.
+	 */
+	regs->msr &= ~MSR_TS_MASK;
+
 	flush_fp_to_thread(current);
 
 #ifdef CONFIG_ALTIVEC
@@ -749,13 +756,6 @@ int handle_rt_signal64(int signr, struct k_sigaction *ka, siginfo_t *info,
 
 	/* Make sure signal handler doesn't get spurious FP exceptions */
 	current->thread.fp_state.fpscr = 0;
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
-	 * just indicates to userland that we were doing a transaction, but we
-	 * don't want to return in transactional state:
-	 */
-	regs->msr &= ~MSR_TS_MASK;
-#endif
 
 	/* Set up to return from userspace. */
 	if (vdso64_rt_sigtramp && current->mm->context.vdso_base) {
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 330841766b09..26e1358ff0bc 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1399,7 +1399,6 @@ void fp_unavailable_tm(struct pt_regs *regs)
 
 	TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n",
 		 regs->nip, regs->msr);
-	tm_enable();
 
         /* We can only have got here if the task started using FP after
          * beginning the transaction.  So, the transactional regs are just a
@@ -1408,8 +1407,7 @@ void fp_unavailable_tm(struct pt_regs *regs)
          * transaction, and probably retry but now with FP enabled.  So the
          * checkpointed FP registers need to be loaded.
 	 */
-	tm_reclaim(&current->thread, current->thread.regs->msr,
-		   TM_CAUSE_FAC_UNAV);
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 	/* Reclaim didn't save out any FPRs to transact_fprs. */
 
 	/* Enable FP for the task: */
@@ -1432,9 +1430,7 @@ void altivec_unavailable_tm(struct pt_regs *regs)
 	TM_DEBUG("Vector Unavailable trap whilst transactional at 0x%lx,"
 		 "MSR=%lx\n",
 		 regs->nip, regs->msr);
-	tm_enable();
-	tm_reclaim(&current->thread, current->thread.regs->msr,
-		   TM_CAUSE_FAC_UNAV);
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 	regs->msr |= MSR_VEC;
 	tm_recheckpoint(&current->thread, regs->msr);
 	current->thread.used_vr = 1;
@@ -1455,10 +1451,8 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 		 "MSR=%lx\n",
 		 regs->nip, regs->msr);
 
-	tm_enable();
 	/* This reclaims FP and/or VR regs if they're already enabled */
-	tm_reclaim(&current->thread, current->thread.regs->msr,
-		   TM_CAUSE_FAC_UNAV);
+	tm_reclaim_current(TM_CAUSE_FAC_UNAV);
 
 	regs->msr |= MSR_VEC | MSR_FP | current->thread.fpexc_mode |
 		MSR_VSX;
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 0458a9aaba9d..74f8050518d6 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -36,6 +36,16 @@ _GLOBAL(do_load_up_transact_altivec)
 	blr
 #endif
 
+/*
+ * Enable use of VMX/Altivec for the caller.
+ */
+_GLOBAL(vec_enable)
+	mfmsr	r3
+	oris	r3,r3,MSR_VEC@h
+	MTMSRD(r3)
+	isync
+	blr
+
 /*
  * Load state from memory into VMX registers including VSCR.
  * Assumes the caller has enabled VMX in the MSR.
-- 
cgit v1.2.3


From b3084f4db3aeb991c507ca774337c7e7893ed04f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 13 Jan 2014 11:34:24 +0530
Subject: powerpc/thp: Fix crash on mremap

This patch fix the below crash

NIP [c00000000004cee4] .__hash_page_thp+0x2a4/0x440
LR [c0000000000439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c000000736103c40] [00001ffffb000000] 0x1ffffb000000(unreliable)
[437908.479693] [c000000736103d50] [c0000000000439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c000000736103e30] [c00000000000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store address to the pgtable at a constant offset (PTRS_PER_PMD) from
pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset
from new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when page fault find the pmd as trans huge we can be sure that
pgtable can be located at the offset.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++++++++++++++
 include/asm-generic/pgtable.h            | 12 ++++++++++++
 mm/huge_memory.c                         | 14 +++++---------
 3 files changed, 31 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index 4a191c472867..d27960c89a71 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -558,5 +558,19 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #define __HAVE_ARCH_PMDP_INVALIDATE
 extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 			    pmd_t *pmdp);
+
+#define pmd_move_must_withdraw pmd_move_must_withdraw
+typedef struct spinlock spinlock_t;
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+					 spinlock_t *old_pmd_ptl)
+{
+	/*
+	 * Archs like ppc64 use pgtable to store per pmd
+	 * specific information. So when we switch the pmd,
+	 * we should also withdraw and deposit the pgtable
+	 */
+	return true;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index db0923458940..8e4f41d9af4d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+					 spinlock_t *old_pmd_ptl)
+{
+	/*
+	 * With split pmd lock we also need to move preallocated
+	 * PTE page table if new_pmd is on different PMD page table.
+	 */
+	return new_pmd_ptl != old_pmd_ptl;
+}
+#endif
+
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95d1acb0f3d2..5d80c53b87cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 			spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
-		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-		if (new_ptl != old_ptl) {
-			pgtable_t pgtable;
 
-			/*
-			 * Move preallocated PTE page table if new_pmd is on
-			 * different PMD page table.
-			 */
+		if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+			pgtable_t pgtable;
 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
-			spin_unlock(new_ptl);
 		}
+		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+		if (new_ptl != old_ptl)
+			spin_unlock(new_ptl);
 		spin_unlock(old_ptl);
 	}
 out:
-- 
cgit v1.2.3


From 7e4e7867b1e551b7b8f326da3604c47332972bc6 Mon Sep 17 00:00:00 2001
From: Gavin Shan <shangw@linux.vnet.ibm.com>
Date: Wed, 15 Jan 2014 13:16:11 +0800
Subject: powerpc/eeh: Handle multiple EEH errors

For one PCI error relevant OPAL event, we possibly have multiple
EEH errors for that. For example, multiple frozen PEs detected on
different PHBs. Unfortunately, we didn't cover the case. The patch
enumarates the return value from eeh_ops::next_error() and change
eeh_handle_special_event() and eeh_ops::next_error() to handle all
existing EEH errors.

As Ben pointed out, we needn't list_for_each_entry_safe() since we
are not deleting any PHB from the hose_list and the EEH serialized
lock should be held while purging EEH events. The patch covers those
suggestions as well.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/eeh.h            |  10 ++
 arch/powerpc/kernel/eeh_driver.c          | 150 ++++++++++++++++--------------
 arch/powerpc/platforms/powernv/eeh-ioda.c |  39 +++++---
 3 files changed, 112 insertions(+), 87 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8b4b8e4a5c32..9e39ceb1d19f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -118,6 +118,16 @@ static inline struct pci_dev *eeh_dev_to_pci_dev(struct eeh_dev *edev)
 	return edev ? edev->pdev : NULL;
 }
 
+/* Return values from eeh_ops::next_error */
+enum {
+	EEH_NEXT_ERR_NONE = 0,
+	EEH_NEXT_ERR_INF,
+	EEH_NEXT_ERR_FROZEN_PE,
+	EEH_NEXT_ERR_FENCED_PHB,
+	EEH_NEXT_ERR_DEAD_PHB,
+	EEH_NEXT_ERR_DEAD_IOC
+};
+
 /*
  * The struct is used to trace the registered EEH operation
  * callback functions. Actually, those operation callback
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7db39203a073..baa3b06e6dab 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -630,84 +630,90 @@ static void eeh_handle_special_event(void)
 {
 	struct eeh_pe *pe, *phb_pe;
 	struct pci_bus *bus;
-	struct pci_controller *hose, *tmp;
+	struct pci_controller *hose;
 	unsigned long flags;
-	int rc = 0;
+	int rc;
 
-	/*
-	 * The return value from next_error() has been classified as follows.
-	 * It might be good to enumerate them. However, next_error() is only
-	 * supported by PowerNV platform for now. So it would be fine to use
-	 * integer directly:
-	 *
-	 * 4 - Dead IOC           3 - Dead PHB
-	 * 2 - Fenced PHB         1 - Frozen PE
-	 * 0 - No error found
-	 *
-	 */
-	rc = eeh_ops->next_error(&pe);
-	if (rc <= 0)
-		return;
 
-	switch (rc) {
-	case 4:
-		/* Mark all PHBs in dead state */
-		eeh_serialize_lock(&flags);
-		list_for_each_entry_safe(hose, tmp,
-				&hose_list, list_node) {
-			phb_pe = eeh_phb_pe_get(hose);
-			if (!phb_pe) continue;
-
-			eeh_pe_state_mark(phb_pe,
-				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+	do {
+		rc = eeh_ops->next_error(&pe);
+
+		switch (rc) {
+		case EEH_NEXT_ERR_DEAD_IOC:
+			/* Mark all PHBs in dead state */
+			eeh_serialize_lock(&flags);
+
+			/* Purge all events */
+			eeh_remove_event(NULL);
+
+			list_for_each_entry(hose, &hose_list, list_node) {
+				phb_pe = eeh_phb_pe_get(hose);
+				if (!phb_pe) continue;
+
+				eeh_pe_state_mark(phb_pe,
+					EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+			}
+
+			eeh_serialize_unlock(flags);
+
+			break;
+		case EEH_NEXT_ERR_FROZEN_PE:
+		case EEH_NEXT_ERR_FENCED_PHB:
+		case EEH_NEXT_ERR_DEAD_PHB:
+			/* Mark the PE in fenced state */
+			eeh_serialize_lock(&flags);
+
+			/* Purge all events of the PHB */
+			eeh_remove_event(pe);
+
+			if (rc == EEH_NEXT_ERR_DEAD_PHB)
+				eeh_pe_state_mark(pe,
+					EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
+			else
+				eeh_pe_state_mark(pe,
+					EEH_PE_ISOLATED | EEH_PE_RECOVERING);
+
+			eeh_serialize_unlock(flags);
+
+			break;
+		case EEH_NEXT_ERR_NONE:
+			return;
+		default:
+			pr_warn("%s: Invalid value %d from next_error()\n",
+				__func__, rc);
+			return;
 		}
-		eeh_serialize_unlock(flags);
-
-		/* Purge all events */
-		eeh_remove_event(NULL);
-		break;
-	case 3:
-	case 2:
-	case 1:
-		/* Mark the PE in fenced state */
-		eeh_serialize_lock(&flags);
-		if (rc == 3)
-			eeh_pe_state_mark(pe,
-				EEH_PE_ISOLATED | EEH_PE_PHB_DEAD);
-		else
-			eeh_pe_state_mark(pe,
-				EEH_PE_ISOLATED | EEH_PE_RECOVERING);
-		eeh_serialize_unlock(flags);
-
-		/* Purge all events of the PHB */
-		eeh_remove_event(pe);
-		break;
-	default:
-		pr_err("%s: Invalid value %d from next_error()\n",
-		       __func__, rc);
-		return;
-	}
 
-	/*
-	 * For fenced PHB and frozen PE, it's handled as normal
-	 * event. We have to remove the affected PHBs for dead
-	 * PHB and IOC
-	 */
-	if (rc == 2 || rc == 1)
-		eeh_handle_normal_event(pe);
-	else {
-		list_for_each_entry_safe(hose, tmp,
-			&hose_list, list_node) {
-			phb_pe = eeh_phb_pe_get(hose);
-			if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD))
-				continue;
-
-			bus = eeh_pe_bus_get(phb_pe);
-			/* Notify all devices that they're about to go down. */
-			eeh_pe_dev_traverse(pe, eeh_report_failure, NULL);
-			pcibios_remove_pci_devices(bus);
+		/*
+		 * For fenced PHB and frozen PE, it's handled as normal
+		 * event. We have to remove the affected PHBs for dead
+		 * PHB and IOC
+		 */
+		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
+		    rc == EEH_NEXT_ERR_FENCED_PHB) {
+			eeh_handle_normal_event(pe);
+		} else {
+			list_for_each_entry(hose, &hose_list, list_node) {
+				phb_pe = eeh_phb_pe_get(hose);
+				if (!phb_pe ||
+				    !(phb_pe->state & EEH_PE_PHB_DEAD))
+					continue;
+
+				/* Notify all devices to be down */
+				bus = eeh_pe_bus_get(phb_pe);
+				eeh_pe_dev_traverse(pe,
+					eeh_report_failure, NULL);
+				pcibios_remove_pci_devices(bus);
+			}
 		}
-	}
+
+		/*
+		 * If we have detected dead IOC, we needn't proceed
+		 * any more since all PHBs would have been removed
+		 */
+		if (rc == EEH_NEXT_ERR_DEAD_IOC)
+			break;
+	} while (rc != EEH_NEXT_ERR_NONE);
 }
 
 /**
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c
index 007ac4989841..8dd16f4675a2 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -735,12 +735,12 @@ static int ioda_eeh_get_pe(struct pci_controller *hose,
  */
 static int ioda_eeh_next_error(struct eeh_pe **pe)
 {
-	struct pci_controller *hose, *tmp;
+	struct pci_controller *hose;
 	struct pnv_phb *phb;
 	u64 frozen_pe_no;
 	u16 err_type, severity;
 	long rc;
-	int ret = 1;
+	int ret = EEH_NEXT_ERR_NONE;
 
 	/*
 	 * While running here, it's safe to purge the event queue.
@@ -750,7 +750,7 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 	eeh_remove_event(NULL);
 	opal_notifier_update_evt(OPAL_EVENT_PCI_ERROR, 0x0ul);
 
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+	list_for_each_entry(hose, &hose_list, list_node) {
 		/*
 		 * If the subordinate PCI buses of the PHB has been
 		 * removed, we needn't take care of it any more.
@@ -789,19 +789,19 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 		switch (err_type) {
 		case OPAL_EEH_IOC_ERROR:
 			if (severity == OPAL_EEH_SEV_IOC_DEAD) {
-				list_for_each_entry_safe(hose, tmp,
-						&hose_list, list_node) {
+				list_for_each_entry(hose, &hose_list,
+						    list_node) {
 					phb = hose->private_data;
 					phb->eeh_state |= PNV_EEH_STATE_REMOVED;
 				}
 
 				pr_err("EEH: dead IOC detected\n");
-				ret = 4;
-				goto out;
+				ret = EEH_NEXT_ERR_DEAD_IOC;
 			} else if (severity == OPAL_EEH_SEV_INF) {
 				pr_info("EEH: IOC informative error "
 					"detected\n");
 				ioda_eeh_hub_diag(hose);
+				ret = EEH_NEXT_ERR_NONE;
 			}
 
 			break;
@@ -813,21 +813,20 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 				pr_err("EEH: dead PHB#%x detected\n",
 					hose->global_number);
 				phb->eeh_state |= PNV_EEH_STATE_REMOVED;
-				ret = 3;
-				goto out;
+				ret = EEH_NEXT_ERR_DEAD_PHB;
 			} else if (severity == OPAL_EEH_SEV_PHB_FENCED) {
 				if (ioda_eeh_get_phb_pe(hose, pe))
 					break;
 
 				pr_err("EEH: fenced PHB#%x detected\n",
 					hose->global_number);
-				ret = 2;
-				goto out;
+				ret = EEH_NEXT_ERR_FENCED_PHB;
 			} else if (severity == OPAL_EEH_SEV_INF) {
 				pr_info("EEH: PHB#%x informative error "
 					"detected\n",
 					hose->global_number);
 				ioda_eeh_phb_diag(hose);
+				ret = EEH_NEXT_ERR_NONE;
 			}
 
 			break;
@@ -837,13 +836,23 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
 
 			pr_err("EEH: Frozen PE#%x on PHB#%x detected\n",
 				(*pe)->addr, (*pe)->phb->global_number);
-			ret = 1;
-			goto out;
+			ret = EEH_NEXT_ERR_FROZEN_PE;
+			break;
+		default:
+			pr_warn("%s: Unexpected error type %d\n",
+				__func__, err_type);
 		}
+
+		/*
+		 * If we have no errors on the specific PHB or only
+		 * informative error there, we continue poking it.
+		 * Otherwise, we need actions to be taken by upper
+		 * layer.
+		 */
+		if (ret > EEH_NEXT_ERR_INF)
+			break;
 	}
 
-	ret = 0;
-out:
 	return ret;
 }
 
-- 
cgit v1.2.3


From f7d98d18a01ece2863984d4fb5ae949b18b02715 Mon Sep 17 00:00:00 2001
From: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Date: Wed, 15 Jan 2014 17:02:04 +1100
Subject: powerpc/powernv: Call OPAL sync before kexec'ing

Its possible that OPAL may be writing to host memory during
kexec (like dump retrieve scenario). In this situation we might
end up corrupting host memory.

This patch makes OPAL sync call to make sure OPAL stops
writing to host memory before kexec'ing.

Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/opal.h                |  2 ++
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 arch/powerpc/platforms/powernv/opal.c          | 16 ++++++++++++++++
 arch/powerpc/platforms/powernv/setup.c         |  6 ++++--
 4 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 9a87b4401a41..40157e2ca691 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -156,6 +156,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
 #define OPAL_FLASH_UPDATE			78
 #define OPAL_GET_MSG				85
 #define OPAL_CHECK_ASYNC_COMPLETION		86
+#define OPAL_SYNC_HOST_REBOOT			87
 
 #ifndef __ASSEMBLY__
 
@@ -828,6 +829,7 @@ int64_t opal_update_flash(uint64_t blk_list);
 
 int64_t opal_get_msg(uint64_t buffer, size_t size);
 int64_t opal_check_completion(uint64_t buffer, size_t size, uint64_t token);
+int64_t opal_sync_host_reboot(void);
 
 /* Internal functions */
 extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data);
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 719aa5c325c6..3e8829c40fbb 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -128,3 +128,4 @@ OPAL_CALL(opal_manage_flash,			OPAL_FLASH_MANAGE);
 OPAL_CALL(opal_update_flash,			OPAL_FLASH_UPDATE);
 OPAL_CALL(opal_get_msg,				OPAL_GET_MSG);
 OPAL_CALL(opal_check_completion,		OPAL_CHECK_ASYNC_COMPLETION);
+OPAL_CALL(opal_sync_host_reboot,		OPAL_SYNC_HOST_REBOOT);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 7a184a0ff183..65499adaecff 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/kobject.h>
+#include <linux/delay.h>
 #include <asm/opal.h>
 #include <asm/firmware.h>
 #include <asm/mce.h>
@@ -482,10 +483,25 @@ subsys_initcall(opal_init);
 void opal_shutdown(void)
 {
 	unsigned int i;
+	long rc = OPAL_BUSY;
 
+	/* First free interrupts, which will also mask them */
 	for (i = 0; i < opal_irq_count; i++) {
 		if (opal_irqs[i])
 			free_irq(opal_irqs[i], NULL);
 		opal_irqs[i] = 0;
 	}
+
+	/*
+	 * Then sync with OPAL which ensure anything that can
+	 * potentially write to our memory has completed such
+	 * as an ongoing dump retrieval
+	 */
+	while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) {
+		rc = opal_sync_host_reboot();
+		if (rc == OPAL_BUSY)
+			opal_poll_events(NULL);
+		else
+			mdelay(10);
+	}
 }
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index 19884b2a51b4..a932feb2901c 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -145,8 +145,10 @@ static void pnv_shutdown(void)
 	/* Let the PCI code clear up IODA tables */
 	pnv_pci_shutdown();
 
-	/* And unregister all OPAL interrupts so they don't fire
-	 * up while we kexec
+	/*
+	 * Stop OPAL activity: Unregister all OPAL interrupts so they
+	 * don't fire up while we kexec and make sure all potentially
+	 * DMA'ing ops are complete (such as dump retrieval).
 	 */
 	opal_shutdown();
 }
-- 
cgit v1.2.3


From 9494a1e8428ea8e60ae77b221b9d32a5edc21ef4 Mon Sep 17 00:00:00 2001
From: Mark Salter <msalter@redhat.com>
Date: Thu, 23 Jan 2014 15:53:55 -0800
Subject: powerpc: use generic fixmap.h

Signed-off-by: Mark Salter <msalter@redhat.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/include/asm/fixmap.h | 44 ++-------------------------------------
 1 file changed, 2 insertions(+), 42 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h
index 5c2c0233175e..90f604bbcd19 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -58,52 +58,12 @@ enum fixed_addresses {
 extern void __set_fixmap (enum fixed_addresses idx,
 					phys_addr_t phys, pgprot_t flags);
 
-#define set_fixmap(idx, phys) \
-		__set_fixmap(idx, phys, PAGE_KERNEL)
-/*
- * Some hardware wants to get fixmapped without caching.
- */
-#define set_fixmap_nocache(idx, phys) \
-		__set_fixmap(idx, phys, PAGE_KERNEL_NCG)
-
-#define clear_fixmap(idx) \
-		__set_fixmap(idx, 0, __pgprot(0))
-
 #define __FIXADDR_SIZE	(__end_of_fixed_addresses << PAGE_SHIFT)
 #define FIXADDR_START		(FIXADDR_TOP - __FIXADDR_SIZE)
 
-#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
-#define __virt_to_fix(x)	((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
-
-extern void __this_fixmap_does_not_exist(void);
-
-/*
- * 'index to address' translation. If anyone tries to use the idx
- * directly without tranlation, we catch the bug with a NULL-deference
- * kernel oops. Illegal ranges of incoming indices are caught too.
- */
-static __always_inline unsigned long fix_to_virt(const unsigned int idx)
-{
-	/*
-	 * this branch gets completely eliminated after inlining,
-	 * except when someone tries to use fixaddr indices in an
-	 * illegal way. (such as mixing up address types or using
-	 * out-of-range indices).
-	 *
-	 * If it doesn't get removed, the linker will complain
-	 * loudly with a reasonably clear error message..
-	 */
-	if (idx >= __end_of_fixed_addresses)
-		__this_fixmap_does_not_exist();
-
-        return __fix_to_virt(idx);
-}
+#define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NCG
 
-static inline unsigned long virt_to_fix(const unsigned long vaddr)
-{
-	BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
-	return __virt_to_fix(vaddr);
-}
+#include <asm-generic/fixmap.h>
 
 #endif /* !__ASSEMBLY__ */
 #endif
-- 
cgit v1.2.3


From 736017752d2f6ed0d64f5e15cf48e79779b11c85 Mon Sep 17 00:00:00 2001
From: Cédric Le Goater <clg@fr.ibm.com>
Date: Thu, 9 Jan 2014 11:51:16 +0100
Subject: KVM: PPC: Book3S: MMIO emulation support for little endian guests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MMIO emulation reads the last instruction executed by the guest
and then emulates. If the guest is running in Little Endian order,
or more generally in a different endian order of the host, the
instruction needs to be byte-swapped before being emulated.

This patch adds a helper routine which tests the endian order of
the host and the guest in order to decide whether a byteswap is
needed or not. It is then used to byteswap the last instruction
of the guest in the endian order of the host before MMIO emulation
is performed.

Finally, kvmppc_handle_load() of kvmppc_handle_store() are modified
to reverse the endianness of the MMIO if required.

Signed-off-by: Cédric Le Goater <clg@fr.ibm.com>
[agraf: add booke handling]
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s.h |  8 +++++++-
 arch/powerpc/include/asm/kvm_booke.h  |  6 ++++++
 arch/powerpc/include/asm/kvm_ppc.h    |  7 ++++---
 arch/powerpc/kvm/book3s_64_mmu_hv.c   |  2 +-
 arch/powerpc/kvm/emulate.c            |  1 -
 arch/powerpc/kvm/powerpc.c            | 28 ++++++++++++++++++++++++----
 6 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index f8b23201c105..1e9c26f45d18 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -264,6 +264,11 @@ static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu)
 	return vcpu->arch.pc;
 }
 
+static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.shared->msr & MSR_LE) != (MSR_KERNEL & MSR_LE);
+}
+
 static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc)
 {
 	/* Load the instruction manually if it failed to do so in the
@@ -271,7 +276,8 @@ static inline u32 kvmppc_get_last_inst_internal(struct kvm_vcpu *vcpu, ulong pc)
 	if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
 		kvmppc_ld(vcpu, &pc, sizeof(u32), &vcpu->arch.last_inst, false);
 
-	return vcpu->arch.last_inst;
+	return kvmppc_need_byteswap(vcpu) ? swab32(vcpu->arch.last_inst) :
+		vcpu->arch.last_inst;
 }
 
 static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index dd8f61510dfd..80d46b5a7efb 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -63,6 +63,12 @@ static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
 	return vcpu->arch.xer;
 }
 
+static inline bool kvmppc_need_byteswap(struct kvm_vcpu *vcpu)
+{
+	/* XXX Would need to check TLB entry */
+	return false;
+}
+
 static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.last_inst;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index c8317fbf92c4..629277df4798 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -54,12 +54,13 @@ extern void kvmppc_handler_highmem(void);
 extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
 extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
                               unsigned int rt, unsigned int bytes,
-                              int is_bigendian);
+			      int is_default_endian);
 extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                unsigned int rt, unsigned int bytes,
-                               int is_bigendian);
+			       int is_default_endian);
 extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                               u64 val, unsigned int bytes, int is_bigendian);
+			       u64 val, unsigned int bytes,
+			       int is_default_endian);
 
 extern int kvmppc_emulate_instruction(struct kvm_run *run,
                                       struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index f3ff587a8b7d..efb8aa544876 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -558,7 +558,7 @@ static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * we just return and retry the instruction.
 	 */
 
-	if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
+	if (instruction_is_store(kvmppc_get_last_inst(vcpu)) != !!is_store)
 		return RESUME_GUEST;
 
 	/*
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 2f9a0873b44f..c2b887be2c29 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -219,7 +219,6 @@ static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
  * lmw
  * stmw
  *
- * XXX is_bigendian should depend on MMU mapping or MSR[LE]
  */
 /* XXX Should probably auto-generate instruction decoding for a particular core
  * from opcode tables in the future. */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 7ca9e0a80499..026dfaaa4772 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -673,9 +673,19 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
 }
 
 int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                       unsigned int rt, unsigned int bytes, int is_bigendian)
+		       unsigned int rt, unsigned int bytes,
+		       int is_default_endian)
 {
 	int idx, ret;
+	int is_bigendian;
+
+	if (kvmppc_need_byteswap(vcpu)) {
+		/* Default endianness is "little endian". */
+		is_bigendian = !is_default_endian;
+	} else {
+		/* Default endianness is "big endian". */
+		is_bigendian = is_default_endian;
+	}
 
 	if (bytes > sizeof(run->mmio.data)) {
 		printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
@@ -711,21 +721,31 @@ EXPORT_SYMBOL_GPL(kvmppc_handle_load);
 
 /* Same as above, but sign extends */
 int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                        unsigned int rt, unsigned int bytes, int is_bigendian)
+			unsigned int rt, unsigned int bytes,
+			int is_default_endian)
 {
 	int r;
 
 	vcpu->arch.mmio_sign_extend = 1;
-	r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
+	r = kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian);
 
 	return r;
 }
 
 int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
-                        u64 val, unsigned int bytes, int is_bigendian)
+			u64 val, unsigned int bytes, int is_default_endian)
 {
 	void *data = run->mmio.data;
 	int idx, ret;
+	int is_bigendian;
+
+	if (kvmppc_need_byteswap(vcpu)) {
+		/* Default endianness is "little endian". */
+		is_bigendian = !is_default_endian;
+	} else {
+		/* Default endianness is "big endian". */
+		is_bigendian = is_default_endian;
+	}
 
 	if (bytes > sizeof(run->mmio.data)) {
 		printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__,
-- 
cgit v1.2.3


From 6c85f52b10fd60e45c6e30c5b85d116406bd3c9b Mon Sep 17 00:00:00 2001
From: Scott Wood <scottwood@freescale.com>
Date: Thu, 9 Jan 2014 19:18:40 -0600
Subject: kvm/ppc: IRQ disabling cleanup

Simplify the handling of lazy EE by going directly from fully-enabled
to hard-disabled.  This replaces the lazy_irq_pending() check
(including its misplaced kvm_guest_exit() call).

As suggested by Tiejun Chen, move the interrupt disabling into
kvmppc_prepare_to_enter() rather than have each caller do it.  Also
move the IRQ enabling on heavyweight exit into
kvmppc_prepare_to_enter().

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_ppc.h |  6 ++++++
 arch/powerpc/kvm/book3s_pr.c       | 14 ++++++--------
 arch/powerpc/kvm/booke.c           | 12 +++++-------
 arch/powerpc/kvm/powerpc.c         | 26 +++++++++-----------------
 4 files changed, 26 insertions(+), 32 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 629277df4798..fcd53f0d34ba 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -456,6 +456,12 @@ static inline void kvmppc_fix_ee_before_entry(void)
 	trace_hardirqs_on();
 
 #ifdef CONFIG_PPC64
+	/*
+	 * To avoid races, the caller must have gone directly from having
+	 * interrupts fully-enabled to hard-disabled.
+	 */
+	WARN_ON(local_paca->irq_happened != PACA_IRQ_HARD_DIS);
+
 	/* Only need to enable IRQs by hard enabling them after this */
 	local_paca->irq_happened = 0;
 	local_paca->soft_enabled = 1;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index aedba681bb94..e82fafdaf880 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -999,14 +999,14 @@ program_interrupt:
 		 * and if we really did time things so badly, then we just exit
 		 * again due to a host external interrupt.
 		 */
-		local_irq_disable();
 		s = kvmppc_prepare_to_enter(vcpu);
-		if (s <= 0) {
-			local_irq_enable();
+		if (s <= 0)
 			r = s;
-		} else {
+		else {
+			/* interrupts now hard-disabled */
 			kvmppc_fix_ee_before_entry();
 		}
+
 		kvmppc_handle_lost_ext(vcpu);
 	}
 
@@ -1219,12 +1219,10 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	 * really did time things so badly, then we just exit again due to
 	 * a host external interrupt.
 	 */
-	local_irq_disable();
 	ret = kvmppc_prepare_to_enter(vcpu);
-	if (ret <= 0) {
-		local_irq_enable();
+	if (ret <= 0)
 		goto out;
-	}
+	/* interrupts now hard-disabled */
 
 	/* Save FPU state in thread_struct */
 	if (current->thread.regs->msr & MSR_FP)
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 6a8c32ec4173..07b89c711898 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -643,7 +643,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
 		local_irq_enable();
 		kvm_vcpu_block(vcpu);
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
-		local_irq_disable();
+		hard_irq_disable();
 
 		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
 		r = 1;
@@ -688,13 +688,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		return -EINVAL;
 	}
 
-	local_irq_disable();
 	s = kvmppc_prepare_to_enter(vcpu);
 	if (s <= 0) {
-		local_irq_enable();
 		ret = s;
 		goto out;
 	}
+	/* interrupts now hard-disabled */
 
 #ifdef CONFIG_PPC_FPU
 	/* Save userspace FPU state in stack */
@@ -1187,12 +1186,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * aren't already exiting to userspace for some other reason.
 	 */
 	if (!(r & RESUME_HOST)) {
-		local_irq_disable();
 		s = kvmppc_prepare_to_enter(vcpu);
-		if (s <= 0) {
-			local_irq_enable();
+		if (s <= 0)
 			r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
-		} else {
+		else {
+			/* interrupts now hard-disabled */
 			kvmppc_fix_ee_before_entry();
 		}
 	}
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 026dfaaa4772..3cf541a53e2a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -68,14 +68,16 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
  */
 int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 {
-	int r = 1;
+	int r;
+
+	WARN_ON(irqs_disabled());
+	hard_irq_disable();
 
-	WARN_ON_ONCE(!irqs_disabled());
 	while (true) {
 		if (need_resched()) {
 			local_irq_enable();
 			cond_resched();
-			local_irq_disable();
+			hard_irq_disable();
 			continue;
 		}
 
@@ -101,7 +103,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 			local_irq_enable();
 			trace_kvm_check_requests(vcpu);
 			r = kvmppc_core_check_requests(vcpu);
-			local_irq_disable();
+			hard_irq_disable();
 			if (r > 0)
 				continue;
 			break;
@@ -113,22 +115,12 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
 			continue;
 		}
 
-#ifdef CONFIG_PPC64
-		/* lazy EE magic */
-		hard_irq_disable();
-		if (lazy_irq_pending()) {
-			/* Got an interrupt in between, try again */
-			local_irq_enable();
-			local_irq_disable();
-			kvm_guest_exit();
-			continue;
-		}
-#endif
-
 		kvm_guest_enter();
-		break;
+		return 1;
 	}
 
+	/* return to host */
+	local_irq_enable();
 	return r;
 }
 EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter);
-- 
cgit v1.2.3


From e0b7ec058c0eb7ba8d5d937d81de2bd16db6970e Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Jan 2014 21:25:20 +1100
Subject: KVM: PPC: Book3S HV: Align physical and virtual CPU thread numbers

On a threaded processor such as POWER7, we group VCPUs into virtual
cores and arrange that the VCPUs in a virtual core run on the same
physical core.  Currently we don't enforce any correspondence between
virtual thread numbers within a virtual core and physical thread
numbers.  Physical threads are allocated starting at 0 on a first-come
first-served basis to runnable virtual threads (VCPUs).

POWER8 implements a new "msgsndp" instruction which guest kernels can
use to interrupt other threads in the same core or sub-core.  Since
the instruction takes the destination physical thread ID as a parameter,
it becomes necessary to align the physical thread IDs with the virtual
thread IDs, that is, to make sure virtual thread N within a virtual
core always runs on physical thread N.

This means that it's possible that thread 0, which is where we call
__kvmppc_vcore_entry, may end up running some other vcpu than the
one whose task called kvmppc_run_core(), or it may end up running
no vcpu at all, if for example thread 0 of the virtual core is
currently executing in userspace.  However, we do need thread 0
to be responsible for switching the MMU -- a previous version of
this patch that had other threads switching the MMU was found to
be responsible for occasional memory corruption and machine check
interrupts in the guest on POWER7 machines.

To accommodate this, we no longer pass the vcpu pointer to
__kvmppc_vcore_entry, but instead let the assembly code load it from
the PACA.  Since the assembly code will need to know the kvm pointer
and the thread ID for threads which don't have a vcpu, we move the
thread ID into the PACA and we add a kvm pointer to the virtual core
structure.

In the case where thread 0 has no vcpu to run, it still calls into
kvmppc_hv_entry in order to do the MMU switch, and then naps until
either its vcpu is ready to run in the guest, or some other thread
needs to exit the guest.  In the latter case, thread 0 jumps to the
code that switches the MMU back to the host.  This control flow means
that now we switch the MMU before loading any guest vcpu state.
Similarly, on guest exit we now save all the guest vcpu state before
switching the MMU back to the host.  This has required substantial
code movement, making the diff rather large.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s_asm.h |   1 +
 arch/powerpc/include/asm/kvm_host.h       |   2 +
 arch/powerpc/kernel/asm-offsets.c         |   3 +-
 arch/powerpc/kvm/book3s_hv.c              |  46 +-
 arch/powerpc/kvm/book3s_hv_interrupts.S   |   6 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 676 +++++++++++++++++-------------
 6 files changed, 397 insertions(+), 337 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 0bd9348a4db9..490b34f5d6bf 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -87,6 +87,7 @@ struct kvmppc_host_state {
 	u8 hwthread_req;
 	u8 hwthread_state;
 	u8 host_ipi;
+	u8 ptid;
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	unsigned long xics_phys;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 2c2ca5faf7f2..b850544dbc3f 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -288,6 +288,7 @@ struct kvmppc_vcore {
 	int n_woken;
 	int nap_count;
 	int napping_threads;
+	int first_vcpuid;
 	u16 pcpu;
 	u16 last_cpu;
 	u8 vcore_state;
@@ -298,6 +299,7 @@ struct kvmppc_vcore {
 	u64 stolen_tb;
 	u64 preempt_tb;
 	struct kvm_vcpu *runner;
+	struct kvm *kvm;
 	u64 tb_offset;		/* guest timebase - host timebase */
 	ulong lpcr;
 	u32 arch_compat;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 5e64c3d2149f..332ae66883e4 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -506,7 +506,6 @@ int main(void)
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
-	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
 	DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
 	DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
 	DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
@@ -514,6 +513,7 @@ int main(void)
 	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
 	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
+	DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
 	DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset));
 	DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
 	DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
@@ -583,6 +583,7 @@ int main(void)
 	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
+	HSTATE_FIELD(HSTATE_PTID, ptid);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7e1813ceabc1..7da53cd215db 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -990,6 +990,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 			init_waitqueue_head(&vcore->wq);
 			vcore->preempt_tb = TB_NIL;
 			vcore->lpcr = kvm->arch.lpcr;
+			vcore->first_vcpuid = core * threads_per_core;
+			vcore->kvm = kvm;
 		}
 		kvm->arch.vcores[core] = vcore;
 		kvm->arch.online_vcores++;
@@ -1003,6 +1005,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	++vcore->num_threads;
 	spin_unlock(&vcore->lock);
 	vcpu->arch.vcore = vcore;
+	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
 
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -1066,7 +1069,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 	}
 }
 
-extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void __kvmppc_vcore_entry(void);
 
 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 				   struct kvm_vcpu *vcpu)
@@ -1140,15 +1143,16 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 	tpaca = &paca[cpu];
 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
 	tpaca->kvm_hstate.kvm_vcore = vc;
-	tpaca->kvm_hstate.napping = 0;
+	tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
 	vcpu->cpu = vc->pcpu;
 	smp_wmb();
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-	if (vcpu->arch.ptid) {
+	if (cpu != smp_processor_id()) {
 #ifdef CONFIG_KVM_XICS
 		xics_wake_cpu(cpu);
 #endif
-		++vc->n_woken;
+		if (vcpu->arch.ptid)
+			++vc->n_woken;
 	}
 #endif
 }
@@ -1205,10 +1209,10 @@ static int on_primary_thread(void)
  */
 static void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vcpu0, *vnext;
+	struct kvm_vcpu *vcpu, *vnext;
 	long ret;
 	u64 now;
-	int ptid, i, need_vpa_update;
+	int i, need_vpa_update;
 	int srcu_idx;
 	struct kvm_vcpu *vcpus_to_update[threads_per_core];
 
@@ -1245,25 +1249,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 		spin_lock(&vc->lock);
 	}
 
-	/*
-	 * Assign physical thread IDs, first to non-ceded vcpus
-	 * and then to ceded ones.
-	 */
-	ptid = 0;
-	vcpu0 = NULL;
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		if (!vcpu->arch.ceded) {
-			if (!ptid)
-				vcpu0 = vcpu;
-			vcpu->arch.ptid = ptid++;
-		}
-	}
-	if (!vcpu0)
-		goto out;	/* nothing to run; should never happen */
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-		if (vcpu->arch.ceded)
-			vcpu->arch.ptid = ptid++;
-
 	/*
 	 * Make sure we are running on thread 0, and that
 	 * secondary threads are offline.
@@ -1280,15 +1265,19 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 		kvmppc_create_dtl_entry(vcpu, vc);
 	}
 
+	/* Set this explicitly in case thread 0 doesn't have a vcpu */
+	get_paca()->kvm_hstate.kvm_vcore = vc;
+	get_paca()->kvm_hstate.ptid = 0;
+
 	vc->vcore_state = VCORE_RUNNING;
 	preempt_disable();
 	spin_unlock(&vc->lock);
 
 	kvm_guest_enter();
 
-	srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu);
+	srcu_idx = srcu_read_lock(&vc->kvm->srcu);
 
-	__kvmppc_vcore_entry(NULL, vcpu0);
+	__kvmppc_vcore_entry();
 
 	spin_lock(&vc->lock);
 	/* disable sending of IPIs on virtual external irqs */
@@ -1303,7 +1292,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->vcore_state = VCORE_EXITING;
 	spin_unlock(&vc->lock);
 
-	srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx);
+	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
 
 	/* make sure updates to secondary vcpu structs are visible now */
 	smp_mb();
@@ -1411,7 +1400,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	if (!signal_pending(current)) {
 		if (vc->vcore_state == VCORE_RUNNING &&
 		    VCORE_EXIT_COUNT(vc) == 0) {
-			vcpu->arch.ptid = vc->n_runnable - 1;
 			kvmppc_create_dtl_entry(vcpu, vc);
 			kvmppc_start_thread(vcpu);
 		} else if (vc->vcore_state == VCORE_SLEEPING) {
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 00b7ed41ea17..e873796b1a29 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -35,7 +35,7 @@
  ****************************************************************************/
 
 /* Registers:
- *  r4: vcpu pointer
+ *  none
  */
 _GLOBAL(__kvmppc_vcore_entry)
 
@@ -71,7 +71,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtmsrd  r10,1
 
 	/* Save host PMU registers */
-	/* R4 is live here (vcpu pointer) but not r3 or r5 */
 	li	r3, 1
 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
 	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
@@ -136,16 +135,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 * enters the guest with interrupts enabled.
 	 */
 BEGIN_FTR_SECTION
+	ld	r4, HSTATE_KVM_VCPU(r13)
 	ld	r0, VCPU_PENDING_EXC(r4)
 	li	r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
 	oris	r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
 	and.	r0, r0, r7
 	beq	32f
-	mr	r31, r4
 	lhz	r3, PACAPACAINDEX(r13)
 	bl	smp_send_reschedule
 	nop
-	mr	r4, r31
 32:
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 #endif /* CONFIG_SMP */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 66db71c9156a..8bbe91bdb6da 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -33,6 +33,10 @@
 #error Need to fix lppaca and SLB shadow accesses in little endian mode
 #endif
 
+/* Values in HSTATE_NAPPING(r13) */
+#define NAPPING_CEDE	1
+#define NAPPING_NOVCPU	2
+
 /*
  * Call kvmppc_hv_entry in real mode.
  * Must be called with interrupts hard-disabled.
@@ -57,6 +61,7 @@ _GLOBAL(kvmppc_hv_entry_trampoline)
 	RFI
 
 kvmppc_call_hv_entry:
+	ld	r4, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_hv_entry
 
 	/* Back from guest - restore host state and return to caller */
@@ -73,15 +78,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	ld	r3,PACA_SPRG3(r13)
 	mtspr	SPRN_SPRG3,r3
 
-	/*
-	 * Reload DEC.  HDEC interrupts were disabled when
-	 * we reloaded the host's LPCR value.
-	 */
-	ld	r3, HSTATE_DECEXP(r13)
-	mftb	r4
-	subf	r4, r4, r3
-	mtspr	SPRN_DEC, r4
-
 	/* Reload the host's PMU registers */
 	ld	r3, PACALPPACAPTR(r13)	/* is the host using the PMU? */
 	lbz	r4, LPPACA_PMCINUSE(r3)
@@ -116,6 +112,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	isync
 23:
 
+	/*
+	 * Reload DEC.  HDEC interrupts were disabled when
+	 * we reloaded the host's LPCR value.
+	 */
+	ld	r3, HSTATE_DECEXP(r13)
+	mftb	r4
+	subf	r4, r4, r3
+	mtspr	SPRN_DEC, r4
+
 	/*
 	 * For external and machine check interrupts, we need
 	 * to call the Linux handler to process the interrupt.
@@ -156,15 +161,82 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 13:	b	machine_check_fwnmi
 
 
+kvmppc_primary_no_guest:
+	/* We handle this much like a ceded vcpu */
+	/* set our bit in napping_threads */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r7, HSTATE_PTID(r13)
+	li	r0, 1
+	sld	r0, r0, r7
+	addi	r6, r5, VCORE_NAPPING_THREADS
+1:	lwarx	r3, 0, r6
+	or	r3, r3, r0
+	stwcx.	r3, 0, r6
+	bne	1b
+	/* order napping_threads update vs testing entry_exit_count */
+	isync
+	li	r12, 0
+	lwz	r7, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r7, 0x100
+	bge	kvm_novcpu_exit	/* another thread already exiting */
+	li	r3, NAPPING_NOVCPU
+	stb	r3, HSTATE_NAPPING(r13)
+	li	r3, 1
+	stb	r3, HSTATE_HWTHREAD_REQ(r13)
+
+	b	kvm_do_nap
+
+kvm_novcpu_wakeup:
+	ld	r1, HSTATE_HOST_R1(r13)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	li	r0, 0
+	stb	r0, HSTATE_NAPPING(r13)
+	stb	r0, HSTATE_HWTHREAD_REQ(r13)
+
+	/* see if any other thread is already exiting */
+	li	r12, 0
+	lwz	r0, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r0, 0x100
+	bge	kvm_novcpu_exit
+
+	/* clear our bit in napping_threads */
+	lbz	r7, HSTATE_PTID(r13)
+	li	r0, 1
+	sld	r0, r0, r7
+	addi	r6, r5, VCORE_NAPPING_THREADS
+4:	lwarx	r3, 0, r6
+	andc	r3, r3, r0
+	stwcx.	r3, 0, r6
+	bne	4b
+
+	/* Check the wake reason in SRR1 to see why we got here */
+	mfspr	r3, SPRN_SRR1
+	rlwinm	r3, r3, 44-31, 0x7	/* extract wake reason field */
+	cmpwi	r3, 4			/* was it an external interrupt? */
+	bne	kvm_novcpu_exit		/* if not, exit the guest */
+
+	/* extern interrupt - read and handle it */
+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
+	bl	kvmppc_read_intr
+	cmpdi	r3, 0
+	bge	kvm_novcpu_exit
+	li	r12, 0
+
+	/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	bne	kvmppc_got_guest
+
+kvm_novcpu_exit:
+	b	hdec_soon
+
 /*
- * We come in here when wakened from nap mode on a secondary hw thread.
+ * We come in here when wakened from nap mode.
  * Relocation is off and most register values are lost.
  * r13 points to the PACA.
  */
 	.globl	kvm_start_guest
 kvm_start_guest:
-	ld	r1,PACAEMERGSP(r13)
-	subi	r1,r1,STACK_FRAME_OVERHEAD
 	ld	r2,PACATOC(r13)
 
 	li	r0,KVM_HWTHREAD_IN_KVM
@@ -176,8 +248,13 @@ kvm_start_guest:
 
 	/* were we napping due to cede? */
 	lbz	r0,HSTATE_NAPPING(r13)
-	cmpwi	r0,0
-	bne	kvm_end_cede
+	cmpwi	r0,NAPPING_CEDE
+	beq	kvm_end_cede
+	cmpwi	r0,NAPPING_NOVCPU
+	beq	kvm_novcpu_wakeup
+
+	ld	r1,PACAEMERGSP(r13)
+	subi	r1,r1,STACK_FRAME_OVERHEAD
 
 	/*
 	 * We weren't napping due to cede, so this must be a secondary
@@ -220,7 +297,13 @@ kvm_start_guest:
 	stw	r8,HSTATE_SAVED_XIRR(r13)
 	b	kvm_no_guest
 
-30:	bl	kvmppc_hv_entry
+30:
+	/* Set HSTATE_DSCR(r13) to something sensible */
+	LOAD_REG_ADDR(r6, dscr_default)
+	ld	r6, 0(r6)
+	std	r6, HSTATE_DSCR(r13)
+
+	bl	kvmppc_hv_entry
 
 	/* Back from the guest, go back to nap */
 	/* Clear our vcpu pointer so we don't come back in early */
@@ -252,6 +335,7 @@ kvm_start_guest:
 kvm_no_guest:
 	li	r0, KVM_HWTHREAD_IN_NAP
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
+kvm_do_nap:
 	li	r3, LPCR_PECE0
 	mfspr	r4, SPRN_LPCR
 	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
@@ -276,7 +360,7 @@ kvmppc_hv_entry:
 
 	/* Required state:
 	 *
-	 * R4 = vcpu pointer
+	 * R4 = vcpu pointer (or NULL)
 	 * MSR = ~IR|DR
 	 * R13 = PACA
 	 * R1 = host R1
@@ -286,124 +370,12 @@ kvmppc_hv_entry:
 	std	r0, PPC_LR_STKOFF(r1)
 	stdu	r1, -112(r1)
 
-BEGIN_FTR_SECTION
-	/* Set partition DABR */
-	/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
-	li	r5,3
-	ld	r6,VCPU_DABR(r4)
-	mtspr	SPRN_DABRX,r5
-	mtspr	SPRN_DABR,r6
- BEGIN_FTR_SECTION_NESTED(89)
-	isync
- END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89)
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-
-	/* Load guest PMU registers */
-	/* R4 is live here (vcpu pointer) */
-	li	r3, 1
-	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
-	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
-	isync
-	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
-	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
-	lwz	r6, VCPU_PMC + 8(r4)
-	lwz	r7, VCPU_PMC + 12(r4)
-	lwz	r8, VCPU_PMC + 16(r4)
-	lwz	r9, VCPU_PMC + 20(r4)
-BEGIN_FTR_SECTION
-	lwz	r10, VCPU_PMC + 24(r4)
-	lwz	r11, VCPU_PMC + 28(r4)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
-	mtspr	SPRN_PMC1, r3
-	mtspr	SPRN_PMC2, r5
-	mtspr	SPRN_PMC3, r6
-	mtspr	SPRN_PMC4, r7
-	mtspr	SPRN_PMC5, r8
-	mtspr	SPRN_PMC6, r9
-BEGIN_FTR_SECTION
-	mtspr	SPRN_PMC7, r10
-	mtspr	SPRN_PMC8, r11
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
-	ld	r3, VCPU_MMCR(r4)
-	ld	r5, VCPU_MMCR + 8(r4)
-	ld	r6, VCPU_MMCR + 16(r4)
-	ld	r7, VCPU_SIAR(r4)
-	ld	r8, VCPU_SDAR(r4)
-	mtspr	SPRN_MMCR1, r5
-	mtspr	SPRN_MMCRA, r6
-	mtspr	SPRN_SIAR, r7
-	mtspr	SPRN_SDAR, r8
-	mtspr	SPRN_MMCR0, r3
-	isync
-
-	/* Load up FP, VMX and VSX registers */
-	bl	kvmppc_load_fp
-
-	ld	r14, VCPU_GPR(R14)(r4)
-	ld	r15, VCPU_GPR(R15)(r4)
-	ld	r16, VCPU_GPR(R16)(r4)
-	ld	r17, VCPU_GPR(R17)(r4)
-	ld	r18, VCPU_GPR(R18)(r4)
-	ld	r19, VCPU_GPR(R19)(r4)
-	ld	r20, VCPU_GPR(R20)(r4)
-	ld	r21, VCPU_GPR(R21)(r4)
-	ld	r22, VCPU_GPR(R22)(r4)
-	ld	r23, VCPU_GPR(R23)(r4)
-	ld	r24, VCPU_GPR(R24)(r4)
-	ld	r25, VCPU_GPR(R25)(r4)
-	ld	r26, VCPU_GPR(R26)(r4)
-	ld	r27, VCPU_GPR(R27)(r4)
-	ld	r28, VCPU_GPR(R28)(r4)
-	ld	r29, VCPU_GPR(R29)(r4)
-	ld	r30, VCPU_GPR(R30)(r4)
-	ld	r31, VCPU_GPR(R31)(r4)
-
-BEGIN_FTR_SECTION
-	/* Switch DSCR to guest value */
-	ld	r5, VCPU_DSCR(r4)
-	mtspr	SPRN_DSCR, r5
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-
-	/*
-	 * Set the decrementer to the guest decrementer.
-	 */
-	ld	r8,VCPU_DEC_EXPIRES(r4)
-	mftb	r7
-	subf	r3,r7,r8
-	mtspr	SPRN_DEC,r3
-	stw	r3,VCPU_DEC(r4)
-
-	ld	r5, VCPU_SPRG0(r4)
-	ld	r6, VCPU_SPRG1(r4)
-	ld	r7, VCPU_SPRG2(r4)
-	ld	r8, VCPU_SPRG3(r4)
-	mtspr	SPRN_SPRG0, r5
-	mtspr	SPRN_SPRG1, r6
-	mtspr	SPRN_SPRG2, r7
-	mtspr	SPRN_SPRG3, r8
-
 	/* Save R1 in the PACA */
 	std	r1, HSTATE_HOST_R1(r13)
 
-	/* Load up DAR and DSISR */
-	ld	r5, VCPU_DAR(r4)
-	lwz	r6, VCPU_DSISR(r4)
-	mtspr	SPRN_DAR, r5
-	mtspr	SPRN_DSISR, r6
-
 	li	r6, KVM_GUEST_MODE_HOST_HV
 	stb	r6, HSTATE_IN_GUEST(r13)
 
-BEGIN_FTR_SECTION
-	/* Restore AMR and UAMOR, set AMOR to all 1s */
-	ld	r5,VCPU_AMR(r4)
-	ld	r6,VCPU_UAMOR(r4)
-	li	r7,-1
-	mtspr	SPRN_AMR,r5
-	mtspr	SPRN_UAMOR,r6
-	mtspr	SPRN_AMOR,r7
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-
 	/* Clear out SLB */
 	li	r6,0
 	slbmte	r6,r6
@@ -429,8 +401,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	bne	21b
 
 	/* Primary thread switches to guest partition. */
-	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
-	lwz	r6,VCPU_PTID(r4)
+	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
+	lbz	r6,HSTATE_PTID(r13)
 	cmpwi	r6,0
 	bne	20f
 	ld	r6,KVM_SDR1(r9)
@@ -504,32 +476,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_RMOR,r8
 	isync
 
-	/* Increment yield count if they have a VPA */
-	ld	r3, VCPU_VPA(r4)
-	cmpdi	r3, 0
-	beq	25f
-	lwz	r5, LPPACA_YIELDCOUNT(r3)
-	addi	r5, r5, 1
-	stw	r5, LPPACA_YIELDCOUNT(r3)
-	li	r6, 1
-	stb	r6, VCPU_VPA_DIRTY(r4)
-25:
 	/* Check if HDEC expires soon */
 	mfspr	r3,SPRN_HDEC
-	cmpwi	r3,10
+	cmpwi	r3,512		/* 1 microsecond */
 	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-	mr	r9,r4
 	blt	hdec_soon
-
-	/* Save purr/spurr */
-	mfspr	r5,SPRN_PURR
-	mfspr	r6,SPRN_SPURR
-	std	r5,HSTATE_PURR(r13)
-	std	r6,HSTATE_SPURR(r13)
-	ld	r7,VCPU_PURR(r4)
-	ld	r8,VCPU_SPURR(r4)
-	mtspr	SPRN_PURR,r7
-	mtspr	SPRN_SPURR,r8
 	b	31f
 
 	/*
@@ -540,7 +491,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 * We also have to invalidate the TLB since its
 	 * entries aren't tagged with the LPID.
 	 */
-30:	ld	r9,VCPU_KVM(r4)		/* pointer to struct kvm */
+30:	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
 
 	/* first take native_tlbie_lock */
 	.section ".toc","aw"
@@ -605,7 +557,6 @@ toc_tlbie_lock:
 	mfspr	r3,SPRN_HDEC
 	cmpwi	r3,10
 	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-	mr	r9,r4
 	blt	hdec_soon
 
 	/* Enable HDEC interrupts */
@@ -620,9 +571,14 @@ toc_tlbie_lock:
 	mfspr	r0,SPRN_HID0
 	mfspr	r0,SPRN_HID0
 	mfspr	r0,SPRN_HID0
+31:
+	/* Do we have a guest vcpu to run? */
+	cmpdi	r4, 0
+	beq	kvmppc_primary_no_guest
+kvmppc_got_guest:
 
 	/* Load up guest SLB entries */
-31:	lwz	r5,VCPU_SLB_MAX(r4)
+	lwz	r5,VCPU_SLB_MAX(r4)
 	cmpwi	r5,0
 	beq	9f
 	mtctr	r5
@@ -633,6 +589,140 @@ toc_tlbie_lock:
 	addi	r6,r6,VCPU_SLB_SIZE
 	bdnz	1b
 9:
+	/* Increment yield count if they have a VPA */
+	ld	r3, VCPU_VPA(r4)
+	cmpdi	r3, 0
+	beq	25f
+	lwz	r5, LPPACA_YIELDCOUNT(r3)
+	addi	r5, r5, 1
+	stw	r5, LPPACA_YIELDCOUNT(r3)
+	li	r6, 1
+	stb	r6, VCPU_VPA_DIRTY(r4)
+25:
+
+BEGIN_FTR_SECTION
+	/* Save purr/spurr */
+	mfspr	r5,SPRN_PURR
+	mfspr	r6,SPRN_SPURR
+	std	r5,HSTATE_PURR(r13)
+	std	r6,HSTATE_SPURR(r13)
+	ld	r7,VCPU_PURR(r4)
+	ld	r8,VCPU_SPURR(r4)
+	mtspr	SPRN_PURR,r7
+	mtspr	SPRN_SPURR,r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+BEGIN_FTR_SECTION
+	/* Set partition DABR */
+	/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
+	li	r5,3
+	ld	r6,VCPU_DABR(r4)
+	mtspr	SPRN_DABRX,r5
+	mtspr	SPRN_DABR,r6
+ BEGIN_FTR_SECTION_NESTED(89)
+	isync
+ END_FTR_SECTION_NESTED(CPU_FTR_ARCH_206, CPU_FTR_ARCH_206, 89)
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+
+	/* Load guest PMU registers */
+	/* R4 is live here (vcpu pointer) */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	isync
+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
+	lwz	r6, VCPU_PMC + 8(r4)
+	lwz	r7, VCPU_PMC + 12(r4)
+	lwz	r8, VCPU_PMC + 16(r4)
+	lwz	r9, VCPU_PMC + 20(r4)
+BEGIN_FTR_SECTION
+	lwz	r10, VCPU_PMC + 24(r4)
+	lwz	r11, VCPU_PMC + 28(r4)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	mtspr	SPRN_PMC1, r3
+	mtspr	SPRN_PMC2, r5
+	mtspr	SPRN_PMC3, r6
+	mtspr	SPRN_PMC4, r7
+	mtspr	SPRN_PMC5, r8
+	mtspr	SPRN_PMC6, r9
+BEGIN_FTR_SECTION
+	mtspr	SPRN_PMC7, r10
+	mtspr	SPRN_PMC8, r11
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	ld	r3, VCPU_MMCR(r4)
+	ld	r5, VCPU_MMCR + 8(r4)
+	ld	r6, VCPU_MMCR + 16(r4)
+	ld	r7, VCPU_SIAR(r4)
+	ld	r8, VCPU_SDAR(r4)
+	mtspr	SPRN_MMCR1, r5
+	mtspr	SPRN_MMCRA, r6
+	mtspr	SPRN_SIAR, r7
+	mtspr	SPRN_SDAR, r8
+	mtspr	SPRN_MMCR0, r3
+	isync
+
+	/* Load up FP, VMX and VSX registers */
+	bl	kvmppc_load_fp
+
+	ld	r14, VCPU_GPR(R14)(r4)
+	ld	r15, VCPU_GPR(R15)(r4)
+	ld	r16, VCPU_GPR(R16)(r4)
+	ld	r17, VCPU_GPR(R17)(r4)
+	ld	r18, VCPU_GPR(R18)(r4)
+	ld	r19, VCPU_GPR(R19)(r4)
+	ld	r20, VCPU_GPR(R20)(r4)
+	ld	r21, VCPU_GPR(R21)(r4)
+	ld	r22, VCPU_GPR(R22)(r4)
+	ld	r23, VCPU_GPR(R23)(r4)
+	ld	r24, VCPU_GPR(R24)(r4)
+	ld	r25, VCPU_GPR(R25)(r4)
+	ld	r26, VCPU_GPR(R26)(r4)
+	ld	r27, VCPU_GPR(R27)(r4)
+	ld	r28, VCPU_GPR(R28)(r4)
+	ld	r29, VCPU_GPR(R29)(r4)
+	ld	r30, VCPU_GPR(R30)(r4)
+	ld	r31, VCPU_GPR(R31)(r4)
+
+BEGIN_FTR_SECTION
+	/* Switch DSCR to guest value */
+	ld	r5, VCPU_DSCR(r4)
+	mtspr	SPRN_DSCR, r5
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+	/*
+	 * Set the decrementer to the guest decrementer.
+	 */
+	ld	r8,VCPU_DEC_EXPIRES(r4)
+	mftb	r7
+	subf	r3,r7,r8
+	mtspr	SPRN_DEC,r3
+	stw	r3,VCPU_DEC(r4)
+
+	ld	r5, VCPU_SPRG0(r4)
+	ld	r6, VCPU_SPRG1(r4)
+	ld	r7, VCPU_SPRG2(r4)
+	ld	r8, VCPU_SPRG3(r4)
+	mtspr	SPRN_SPRG0, r5
+	mtspr	SPRN_SPRG1, r6
+	mtspr	SPRN_SPRG2, r7
+	mtspr	SPRN_SPRG3, r8
+
+	/* Load up DAR and DSISR */
+	ld	r5, VCPU_DAR(r4)
+	lwz	r6, VCPU_DSISR(r4)
+	mtspr	SPRN_DAR, r5
+	mtspr	SPRN_DSISR, r6
+
+BEGIN_FTR_SECTION
+	/* Restore AMR and UAMOR, set AMOR to all 1s */
+	ld	r5,VCPU_AMR(r4)
+	ld	r6,VCPU_UAMOR(r4)
+	li	r7,-1
+	mtspr	SPRN_AMR,r5
+	mtspr	SPRN_UAMOR,r6
+	mtspr	SPRN_AMOR,r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
 	/* Restore state of CTRL run bit; assume 1 on entry */
 	lwz	r5,VCPU_CTRL(r4)
@@ -984,13 +1074,130 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_SPURR,r4
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
 
+	/* Save DEC */
+	mfspr	r5,SPRN_DEC
+	mftb	r6
+	extsw	r5,r5
+	add	r5,r5,r6
+	std	r5,VCPU_DEC_EXPIRES(r9)
+
+	/* Save and reset AMR and UAMOR before turning on the MMU */
+BEGIN_FTR_SECTION
+	mfspr	r5,SPRN_AMR
+	mfspr	r6,SPRN_UAMOR
+	std	r5,VCPU_AMR(r9)
+	std	r6,VCPU_UAMOR(r9)
+	li	r6,0
+	mtspr	SPRN_AMR,r6
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+	/* Switch DSCR back to host value */
+BEGIN_FTR_SECTION
+	mfspr	r8, SPRN_DSCR
+	ld	r7, HSTATE_DSCR(r13)
+	std	r8, VCPU_DSCR(r9)
+	mtspr	SPRN_DSCR, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
+	/* Save non-volatile GPRs */
+	std	r14, VCPU_GPR(R14)(r9)
+	std	r15, VCPU_GPR(R15)(r9)
+	std	r16, VCPU_GPR(R16)(r9)
+	std	r17, VCPU_GPR(R17)(r9)
+	std	r18, VCPU_GPR(R18)(r9)
+	std	r19, VCPU_GPR(R19)(r9)
+	std	r20, VCPU_GPR(R20)(r9)
+	std	r21, VCPU_GPR(R21)(r9)
+	std	r22, VCPU_GPR(R22)(r9)
+	std	r23, VCPU_GPR(R23)(r9)
+	std	r24, VCPU_GPR(R24)(r9)
+	std	r25, VCPU_GPR(R25)(r9)
+	std	r26, VCPU_GPR(R26)(r9)
+	std	r27, VCPU_GPR(R27)(r9)
+	std	r28, VCPU_GPR(R28)(r9)
+	std	r29, VCPU_GPR(R29)(r9)
+	std	r30, VCPU_GPR(R30)(r9)
+	std	r31, VCPU_GPR(R31)(r9)
+
+	/* Save SPRGs */
+	mfspr	r3, SPRN_SPRG0
+	mfspr	r4, SPRN_SPRG1
+	mfspr	r5, SPRN_SPRG2
+	mfspr	r6, SPRN_SPRG3
+	std	r3, VCPU_SPRG0(r9)
+	std	r4, VCPU_SPRG1(r9)
+	std	r5, VCPU_SPRG2(r9)
+	std	r6, VCPU_SPRG3(r9)
+
+	/* save FP state */
+	mr	r3, r9
+	bl	kvmppc_save_fp
+
+	/* Increment yield count if they have a VPA */
+	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
+	cmpdi	r8, 0
+	beq	25f
+	lwz	r3, LPPACA_YIELDCOUNT(r8)
+	addi	r3, r3, 1
+	stw	r3, LPPACA_YIELDCOUNT(r8)
+	li	r3, 1
+	stb	r3, VCPU_VPA_DIRTY(r9)
+25:
+	/* Save PMU registers if requested */
+	/* r8 and cr0.eq are live here */
+	li	r3, 1
+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
+	mfspr	r6, SPRN_MMCRA
+BEGIN_FTR_SECTION
+	/* On P7, clear MMCRA in order to disable SDAR updates */
+	li	r7, 0
+	mtspr	SPRN_MMCRA, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+	isync
+	beq	21f			/* if no VPA, save PMU stuff anyway */
+	lbz	r7, LPPACA_PMCINUSE(r8)
+	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
+	bne	21f
+	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
+	b	22f
+21:	mfspr	r5, SPRN_MMCR1
+	mfspr	r7, SPRN_SIAR
+	mfspr	r8, SPRN_SDAR
+	std	r4, VCPU_MMCR(r9)
+	std	r5, VCPU_MMCR + 8(r9)
+	std	r6, VCPU_MMCR + 16(r9)
+	std	r7, VCPU_SIAR(r9)
+	std	r8, VCPU_SDAR(r9)
+	mfspr	r3, SPRN_PMC1
+	mfspr	r4, SPRN_PMC2
+	mfspr	r5, SPRN_PMC3
+	mfspr	r6, SPRN_PMC4
+	mfspr	r7, SPRN_PMC5
+	mfspr	r8, SPRN_PMC6
+BEGIN_FTR_SECTION
+	mfspr	r10, SPRN_PMC7
+	mfspr	r11, SPRN_PMC8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+	stw	r3, VCPU_PMC(r9)
+	stw	r4, VCPU_PMC + 4(r9)
+	stw	r5, VCPU_PMC + 8(r9)
+	stw	r6, VCPU_PMC + 12(r9)
+	stw	r7, VCPU_PMC + 16(r9)
+	stw	r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+	stw	r10, VCPU_PMC + 24(r9)
+	stw	r11, VCPU_PMC + 28(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+22:
 	/* Clear out SLB */
 	li	r5,0
 	slbmte	r5,r5
 	slbia
 	ptesync
 
-hdec_soon:			/* r9 = vcpu, r12 = trap, r13 = paca */
+hdec_soon:			/* r12 = trap, r13 = paca */
 BEGIN_FTR_SECTION
 	b	32f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
@@ -1024,8 +1231,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 */
 	cmpwi	r3,0x100	/* Are we the first here? */
 	bge	43f
-	cmpwi	r3,1		/* Are any other threads in the guest? */
-	ble	43f
 	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
 	beq	40f
 	li	r0,0
@@ -1036,7 +1241,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 * doesn't wake CPUs up from nap.
 	 */
 	lwz	r3,VCORE_NAPPING_THREADS(r5)
-	lwz	r4,VCPU_PTID(r9)
+	lbz	r4,HSTATE_PTID(r13)
 	li	r0,1
 	sld	r0,r0,r4
 	andc.	r3,r3,r0		/* no sense IPI'ing ourselves */
@@ -1053,10 +1258,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	addi	r6,r6,PACA_SIZE
 	bne	42b
 
+secondary_too_late:
 	/* Secondary threads wait for primary to do partition switch */
-43:	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
-	ld	r5,HSTATE_KVM_VCORE(r13)
-	lwz	r3,VCPU_PTID(r9)
+43:	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
+	lbz	r3,HSTATE_PTID(r13)
 	cmpwi	r3,0
 	beq	15f
 	HMT_LOW
@@ -1121,7 +1327,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 * We have to lock against concurrent tlbies, and
 	 * we have to flush the whole TLB.
 	 */
-32:	ld	r4,VCPU_KVM(r9)		/* pointer to struct kvm */
+32:	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
 
 	/* Take the guest's tlbie_lock */
 #ifdef __BIG_ENDIAN__
@@ -1204,151 +1411,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 1:	addi	r8,r8,16
 	.endr
 
-	/* Save DEC */
-	mfspr	r5,SPRN_DEC
-	mftb	r6
-	extsw	r5,r5
-	add	r5,r5,r6
-	std	r5,VCPU_DEC_EXPIRES(r9)
-
-	/* Save and reset AMR and UAMOR before turning on the MMU */
-BEGIN_FTR_SECTION
-	mfspr	r5,SPRN_AMR
-	mfspr	r6,SPRN_UAMOR
-	std	r5,VCPU_AMR(r9)
-	std	r6,VCPU_UAMOR(r9)
-	li	r6,0
-	mtspr	SPRN_AMR,r6
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-
 	/* Unset guest mode */
 	li	r0, KVM_GUEST_MODE_NONE
 	stb	r0, HSTATE_IN_GUEST(r13)
 
-	/* Switch DSCR back to host value */
-BEGIN_FTR_SECTION
-	mfspr	r8, SPRN_DSCR
-	ld	r7, HSTATE_DSCR(r13)
-	std	r8, VCPU_DSCR(r9)
-	mtspr	SPRN_DSCR, r7
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-
-	/* Save non-volatile GPRs */
-	std	r14, VCPU_GPR(R14)(r9)
-	std	r15, VCPU_GPR(R15)(r9)
-	std	r16, VCPU_GPR(R16)(r9)
-	std	r17, VCPU_GPR(R17)(r9)
-	std	r18, VCPU_GPR(R18)(r9)
-	std	r19, VCPU_GPR(R19)(r9)
-	std	r20, VCPU_GPR(R20)(r9)
-	std	r21, VCPU_GPR(R21)(r9)
-	std	r22, VCPU_GPR(R22)(r9)
-	std	r23, VCPU_GPR(R23)(r9)
-	std	r24, VCPU_GPR(R24)(r9)
-	std	r25, VCPU_GPR(R25)(r9)
-	std	r26, VCPU_GPR(R26)(r9)
-	std	r27, VCPU_GPR(R27)(r9)
-	std	r28, VCPU_GPR(R28)(r9)
-	std	r29, VCPU_GPR(R29)(r9)
-	std	r30, VCPU_GPR(R30)(r9)
-	std	r31, VCPU_GPR(R31)(r9)
-
-	/* Save SPRGs */
-	mfspr	r3, SPRN_SPRG0
-	mfspr	r4, SPRN_SPRG1
-	mfspr	r5, SPRN_SPRG2
-	mfspr	r6, SPRN_SPRG3
-	std	r3, VCPU_SPRG0(r9)
-	std	r4, VCPU_SPRG1(r9)
-	std	r5, VCPU_SPRG2(r9)
-	std	r6, VCPU_SPRG3(r9)
-
-	/* save FP state */
-	mr	r3, r9
-	bl	kvmppc_save_fp
-
-	/* Increment yield count if they have a VPA */
-	ld	r8, VCPU_VPA(r9)	/* do they have a VPA? */
-	cmpdi	r8, 0
-	beq	25f
-	lwz	r3, LPPACA_YIELDCOUNT(r8)
-	addi	r3, r3, 1
-	stw	r3, LPPACA_YIELDCOUNT(r8)
-	li	r3, 1
-	stb	r3, VCPU_VPA_DIRTY(r9)
-25:
-	/* Save PMU registers if requested */
-	/* r8 and cr0.eq are live here */
-	li	r3, 1
-	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
-	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
-	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
-	mfspr	r6, SPRN_MMCRA
-BEGIN_FTR_SECTION
-	/* On P7, clear MMCRA in order to disable SDAR updates */
-	li	r7, 0
-	mtspr	SPRN_MMCRA, r7
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-	isync
-	beq	21f			/* if no VPA, save PMU stuff anyway */
-	lbz	r7, LPPACA_PMCINUSE(r8)
-	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
-	bne	21f
-	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
-	b	22f
-21:	mfspr	r5, SPRN_MMCR1
-	mfspr	r7, SPRN_SIAR
-	mfspr	r8, SPRN_SDAR
-	std	r4, VCPU_MMCR(r9)
-	std	r5, VCPU_MMCR + 8(r9)
-	std	r6, VCPU_MMCR + 16(r9)
-	std	r7, VCPU_SIAR(r9)
-	std	r8, VCPU_SDAR(r9)
-	mfspr	r3, SPRN_PMC1
-	mfspr	r4, SPRN_PMC2
-	mfspr	r5, SPRN_PMC3
-	mfspr	r6, SPRN_PMC4
-	mfspr	r7, SPRN_PMC5
-	mfspr	r8, SPRN_PMC6
-BEGIN_FTR_SECTION
-	mfspr	r10, SPRN_PMC7
-	mfspr	r11, SPRN_PMC8
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
-	stw	r3, VCPU_PMC(r9)
-	stw	r4, VCPU_PMC + 4(r9)
-	stw	r5, VCPU_PMC + 8(r9)
-	stw	r6, VCPU_PMC + 12(r9)
-	stw	r7, VCPU_PMC + 16(r9)
-	stw	r8, VCPU_PMC + 20(r9)
-BEGIN_FTR_SECTION
-	stw	r10, VCPU_PMC + 24(r9)
-	stw	r11, VCPU_PMC + 28(r9)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
-22:
 	ld	r0, 112+PPC_LR_STKOFF(r1)
 	addi	r1, r1, 112
 	mtlr	r0
 	blr
-secondary_too_late:
-	ld	r5,HSTATE_KVM_VCORE(r13)
-	HMT_LOW
-13:	lbz	r3,VCORE_IN_GUEST(r5)
-	cmpwi	r3,0
-	bne	13b
-	HMT_MEDIUM
-	li	r0, KVM_GUEST_MODE_NONE
-	stb	r0, HSTATE_IN_GUEST(r13)
-	ld	r11,PACA_SLBSHADOWPTR(r13)
-
-	.rept	SLB_NUM_BOLTED
-	ld	r5,SLBSHADOW_SAVEAREA(r11)
-	ld	r6,SLBSHADOW_SAVEAREA+8(r11)
-	andis.	r7,r5,SLB_ESID_V@h
-	beq	1f
-	slbmte	r6,r5
-1:	addi	r11,r11,16
-	.endr
-	b	22b
 
 /*
  * Check whether an HDSI is an HPTE not found fault or something else.
@@ -1649,7 +1719,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 	 * up to the host.
 	 */
 	ld	r5,HSTATE_KVM_VCORE(r13)
-	lwz	r6,VCPU_PTID(r3)
+	lbz	r6,HSTATE_PTID(r13)
 	lwz	r8,VCORE_ENTRY_EXIT(r5)
 	clrldi	r8,r8,56
 	li	r0,1
@@ -1662,7 +1732,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 	bge	kvm_cede_exit
 	stwcx.	r4,0,r6
 	bne	31b
-	li	r0,1
+	li	r0,NAPPING_CEDE
 	stb	r0,HSTATE_NAPPING(r13)
 	/* order napping_threads update vs testing entry_exit_count */
 	lwsync
@@ -1751,7 +1821,7 @@ kvm_end_cede:
 
 	/* clear our bit in vcore->napping_threads */
 33:	ld	r5,HSTATE_KVM_VCORE(r13)
-	lwz	r3,VCPU_PTID(r4)
+	lbz	r3,HSTATE_PTID(r13)
 	li	r0,1
 	sld	r0,r0,r3
 	addi	r6,r5,VCORE_NAPPING_THREADS
-- 
cgit v1.2.3


From b005255e12a311d2c87ea70a7c7b192b2187c22c Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 8 Jan 2014 21:25:21 +1100
Subject: KVM: PPC: Book3S HV: Context-switch new POWER8 SPRs

This adds fields to the struct kvm_vcpu_arch to store the new
guest-accessible SPRs on POWER8, adds code to the get/set_one_reg
functions to allow userspace to access this state, and adds code to
the guest entry and exit to context-switch these SPRs between host
and guest.

Note that DPDES (Directed Privileged Doorbell Exception State) is
shared between threads on a core; hence we store it in struct
kvmppc_vcore and have the master thread save and restore it.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h     |  25 +++++-
 arch/powerpc/include/asm/reg.h          |  17 ++++
 arch/powerpc/include/uapi/asm/kvm.h     |   1 +
 arch/powerpc/kernel/asm-offsets.c       |  23 +++++
 arch/powerpc/kvm/book3s_hv.c            | 153 +++++++++++++++++++++++++++++++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 145 ++++++++++++++++++++++++++++++
 6 files changed, 361 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index b850544dbc3f..81c92d1d7978 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -304,6 +304,7 @@ struct kvmppc_vcore {
 	ulong lpcr;
 	u32 arch_compat;
 	ulong pcr;
+	ulong dpdes;		/* doorbell state (POWER8) */
 };
 
 #define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
@@ -448,6 +449,7 @@ struct kvm_vcpu_arch {
 	ulong pc;
 	ulong ctr;
 	ulong lr;
+	ulong tar;
 
 	ulong xer;
 	u32 cr;
@@ -457,13 +459,32 @@ struct kvm_vcpu_arch {
 	ulong guest_owned_ext;
 	ulong purr;
 	ulong spurr;
+	ulong ic;
+	ulong vtb;
 	ulong dscr;
 	ulong amr;
 	ulong uamor;
+	ulong iamr;
 	u32 ctrl;
 	ulong dabr;
+	ulong dawr;
+	ulong dawrx;
+	ulong ciabr;
 	ulong cfar;
 	ulong ppr;
+	ulong pspb;
+	ulong fscr;
+	ulong tfhar;
+	ulong tfiar;
+	ulong texasr;
+	ulong ebbhr;
+	ulong ebbrr;
+	ulong bescr;
+	ulong csigr;
+	ulong tacr;
+	ulong tcscr;
+	ulong acop;
+	ulong wort;
 	ulong shadow_srr1;
 #endif
 	u32 vrsave; /* also USPRG0 */
@@ -498,10 +519,12 @@ struct kvm_vcpu_arch {
 	u32 ccr1;
 	u32 dbsr;
 
-	u64 mmcr[3];
+	u64 mmcr[5];
 	u32 pmc[8];
+	u32 spmc[2];
 	u64 siar;
 	u64 sdar;
+	u64 sier;
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct mutex exit_timing_lock;
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 5c45787d551e..2f41e6475648 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -223,6 +223,11 @@
 #define   CTRL_TE	0x00c00000	/* thread enable */
 #define   CTRL_RUNLATCH	0x1
 #define SPRN_DAWR	0xB4
+#define SPRN_CIABR	0xBB
+#define   CIABR_PRIV		0x3
+#define   CIABR_PRIV_USER	1
+#define   CIABR_PRIV_SUPER	2
+#define   CIABR_PRIV_HYPER	3
 #define SPRN_DAWRX	0xBC
 #define   DAWRX_USER	(1UL << 0)
 #define   DAWRX_KERNEL	(1UL << 1)
@@ -260,6 +265,8 @@
 #define SPRN_HRMOR	0x139	/* Real mode offset register */
 #define SPRN_HSRR0	0x13A	/* Hypervisor Save/Restore 0 */
 #define SPRN_HSRR1	0x13B	/* Hypervisor Save/Restore 1 */
+#define SPRN_IC		0x350	/* Virtual Instruction Count */
+#define SPRN_VTB	0x351	/* Virtual Time Base */
 /* HFSCR and FSCR bit numbers are the same */
 #define FSCR_TAR_LG	8	/* Enable Target Address Register */
 #define FSCR_EBB_LG	7	/* Enable Event Based Branching */
@@ -368,6 +375,8 @@
 #define DER_EBRKE	0x00000002	/* External Breakpoint Interrupt */
 #define DER_DPIE	0x00000001	/* Dev. Port Nonmaskable Request */
 #define SPRN_DMISS	0x3D0		/* Data TLB Miss Register */
+#define SPRN_DHDES	0x0B1		/* Directed Hyp. Doorbell Exc. State */
+#define SPRN_DPDES	0x0B0		/* Directed Priv. Doorbell Exc. State */
 #define SPRN_EAR	0x11A		/* External Address Register */
 #define SPRN_HASH1	0x3D2		/* Primary Hash Address Register */
 #define SPRN_HASH2	0x3D3		/* Secondary Hash Address Resgister */
@@ -427,6 +436,7 @@
 #define SPRN_IABR	0x3F2	/* Instruction Address Breakpoint Register */
 #define SPRN_IABR2	0x3FA		/* 83xx */
 #define SPRN_IBCR	0x135		/* 83xx Insn Breakpoint Control Reg */
+#define SPRN_IAMR	0x03D		/* Instr. Authority Mask Reg */
 #define SPRN_HID4	0x3F4		/* 970 HID4 */
 #define  HID4_LPES0	 (1ul << (63-0)) /* LPAR env. sel. bit 0 */
 #define	 HID4_RMLS2_SH	 (63 - 2)	/* Real mode limit bottom 2 bits */
@@ -541,6 +551,7 @@
 #define SPRN_PIR	0x3FF	/* Processor Identification Register */
 #endif
 #define SPRN_TIR	0x1BE	/* Thread Identification Register */
+#define SPRN_PSPB	0x09F	/* Problem State Priority Boost reg */
 #define SPRN_PTEHI	0x3D5	/* 981 7450 PTE HI word (S/W TLB load) */
 #define SPRN_PTELO	0x3D6	/* 982 7450 PTE LO word (S/W TLB load) */
 #define SPRN_PURR	0x135	/* Processor Utilization of Resources Reg */
@@ -682,6 +693,7 @@
 #define SPRN_EBBHR	804	/* Event based branch handler register */
 #define SPRN_EBBRR	805	/* Event based branch return register */
 #define SPRN_BESCR	806	/* Branch event status and control register */
+#define SPRN_WORT	895	/* Workload optimization register - thread */
 
 #define SPRN_PMC1	787
 #define SPRN_PMC2	788
@@ -698,6 +710,11 @@
 #define   SIER_SIHV		0x1000000	/* Sampled MSR_HV */
 #define   SIER_SIAR_VALID	0x0400000	/* SIAR contents valid */
 #define   SIER_SDAR_VALID	0x0200000	/* SDAR contents valid */
+#define SPRN_TACR	888
+#define SPRN_TCSCR	889
+#define SPRN_CSIGR	890
+#define SPRN_SPMC1	892
+#define SPRN_SPMC2	893
 
 /* When EBB is enabled, some of MMCR0/MMCR2/SIER are user accessible */
 #define MMCR0_USER_MASK	(MMCR0_FC | MMCR0_PMXE | MMCR0_PMAO)
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 6836ec79a830..a586fb9b77bd 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -545,6 +545,7 @@ struct kvm_get_htab_header {
 #define KVM_REG_PPC_TCSCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb1)
 #define KVM_REG_PPC_PID		(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb2)
 #define KVM_REG_PPC_ACOP	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb3)
+#define KVM_REG_PPC_WORT	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb4)
 
 #define KVM_REG_PPC_VRSAVE	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb4)
 #define KVM_REG_PPC_LPCR	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb5)
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 332ae66883e4..043900abbbb0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -432,6 +432,7 @@ int main(void)
 	DEFINE(VCPU_XER, offsetof(struct kvm_vcpu, arch.xer));
 	DEFINE(VCPU_CTR, offsetof(struct kvm_vcpu, arch.ctr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
+	DEFINE(VCPU_TAR, offsetof(struct kvm_vcpu, arch.tar));
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
 	DEFINE(VCPU_PC, offsetof(struct kvm_vcpu, arch.pc));
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -484,11 +485,17 @@ int main(void)
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
 	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
+	DEFINE(VCPU_IC, offsetof(struct kvm_vcpu, arch.ic));
+	DEFINE(VCPU_VTB, offsetof(struct kvm_vcpu, arch.vtb));
 	DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr));
 	DEFINE(VCPU_AMR, offsetof(struct kvm_vcpu, arch.amr));
 	DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor));
+	DEFINE(VCPU_IAMR, offsetof(struct kvm_vcpu, arch.iamr));
 	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
 	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
+	DEFINE(VCPU_DAWR, offsetof(struct kvm_vcpu, arch.dawr));
+	DEFINE(VCPU_DAWRX, offsetof(struct kvm_vcpu, arch.dawrx));
+	DEFINE(VCPU_CIABR, offsetof(struct kvm_vcpu, arch.ciabr));
 	DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags));
 	DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec));
 	DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires));
@@ -497,8 +504,10 @@ int main(void)
 	DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded));
 	DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr));
 	DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc));
+	DEFINE(VCPU_SPMC, offsetof(struct kvm_vcpu, arch.spmc));
 	DEFINE(VCPU_SIAR, offsetof(struct kvm_vcpu, arch.siar));
 	DEFINE(VCPU_SDAR, offsetof(struct kvm_vcpu, arch.sdar));
+	DEFINE(VCPU_SIER, offsetof(struct kvm_vcpu, arch.sier));
 	DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
 	DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
 	DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
@@ -508,6 +517,19 @@ int main(void)
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
 	DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
 	DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
+	DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr));
+	DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb));
+	DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar));
+	DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar));
+	DEFINE(VCPU_TEXASR, offsetof(struct kvm_vcpu, arch.texasr));
+	DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr));
+	DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr));
+	DEFINE(VCPU_BESCR, offsetof(struct kvm_vcpu, arch.bescr));
+	DEFINE(VCPU_CSIGR, offsetof(struct kvm_vcpu, arch.csigr));
+	DEFINE(VCPU_TACR, offsetof(struct kvm_vcpu, arch.tacr));
+	DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr));
+	DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
+	DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
 	DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
 	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
 	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
@@ -517,6 +539,7 @@ int main(void)
 	DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset));
 	DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
 	DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
+	DEFINE(VCORE_DPDES, offsetof(struct kvmppc_vcore, dpdes));
 	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
 	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
 	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7da53cd215db..5b08ddf91d2d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -800,7 +800,7 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_UAMOR:
 		*val = get_reg_val(id, vcpu->arch.uamor);
 		break;
-	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
 		i = id - KVM_REG_PPC_MMCR0;
 		*val = get_reg_val(id, vcpu->arch.mmcr[i]);
 		break;
@@ -808,12 +808,85 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 		i = id - KVM_REG_PPC_PMC1;
 		*val = get_reg_val(id, vcpu->arch.pmc[i]);
 		break;
+	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
+		i = id - KVM_REG_PPC_SPMC1;
+		*val = get_reg_val(id, vcpu->arch.spmc[i]);
+		break;
 	case KVM_REG_PPC_SIAR:
 		*val = get_reg_val(id, vcpu->arch.siar);
 		break;
 	case KVM_REG_PPC_SDAR:
 		*val = get_reg_val(id, vcpu->arch.sdar);
 		break;
+	case KVM_REG_PPC_SIER:
+		*val = get_reg_val(id, vcpu->arch.sier);
+		break;
+	case KVM_REG_PPC_IAMR:
+		*val = get_reg_val(id, vcpu->arch.iamr);
+		break;
+	case KVM_REG_PPC_TFHAR:
+		*val = get_reg_val(id, vcpu->arch.tfhar);
+		break;
+	case KVM_REG_PPC_TFIAR:
+		*val = get_reg_val(id, vcpu->arch.tfiar);
+		break;
+	case KVM_REG_PPC_TEXASR:
+		*val = get_reg_val(id, vcpu->arch.texasr);
+		break;
+	case KVM_REG_PPC_FSCR:
+		*val = get_reg_val(id, vcpu->arch.fscr);
+		break;
+	case KVM_REG_PPC_PSPB:
+		*val = get_reg_val(id, vcpu->arch.pspb);
+		break;
+	case KVM_REG_PPC_EBBHR:
+		*val = get_reg_val(id, vcpu->arch.ebbhr);
+		break;
+	case KVM_REG_PPC_EBBRR:
+		*val = get_reg_val(id, vcpu->arch.ebbrr);
+		break;
+	case KVM_REG_PPC_BESCR:
+		*val = get_reg_val(id, vcpu->arch.bescr);
+		break;
+	case KVM_REG_PPC_TAR:
+		*val = get_reg_val(id, vcpu->arch.tar);
+		break;
+	case KVM_REG_PPC_DPDES:
+		*val = get_reg_val(id, vcpu->arch.vcore->dpdes);
+		break;
+	case KVM_REG_PPC_DAWR:
+		*val = get_reg_val(id, vcpu->arch.dawr);
+		break;
+	case KVM_REG_PPC_DAWRX:
+		*val = get_reg_val(id, vcpu->arch.dawrx);
+		break;
+	case KVM_REG_PPC_CIABR:
+		*val = get_reg_val(id, vcpu->arch.ciabr);
+		break;
+	case KVM_REG_PPC_IC:
+		*val = get_reg_val(id, vcpu->arch.ic);
+		break;
+	case KVM_REG_PPC_VTB:
+		*val = get_reg_val(id, vcpu->arch.vtb);
+		break;
+	case KVM_REG_PPC_CSIGR:
+		*val = get_reg_val(id, vcpu->arch.csigr);
+		break;
+	case KVM_REG_PPC_TACR:
+		*val = get_reg_val(id, vcpu->arch.tacr);
+		break;
+	case KVM_REG_PPC_TCSCR:
+		*val = get_reg_val(id, vcpu->arch.tcscr);
+		break;
+	case KVM_REG_PPC_PID:
+		*val = get_reg_val(id, vcpu->arch.pid);
+		break;
+	case KVM_REG_PPC_ACOP:
+		*val = get_reg_val(id, vcpu->arch.acop);
+		break;
+	case KVM_REG_PPC_WORT:
+		*val = get_reg_val(id, vcpu->arch.wort);
+		break;
 	case KVM_REG_PPC_VPA_ADDR:
 		spin_lock(&vcpu->arch.vpa_update_lock);
 		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
@@ -882,7 +955,7 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_UAMOR:
 		vcpu->arch.uamor = set_reg_val(id, *val);
 		break;
-	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
+	case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
 		i = id - KVM_REG_PPC_MMCR0;
 		vcpu->arch.mmcr[i] = set_reg_val(id, *val);
 		break;
@@ -890,12 +963,88 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 		i = id - KVM_REG_PPC_PMC1;
 		vcpu->arch.pmc[i] = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
+		i = id - KVM_REG_PPC_SPMC1;
+		vcpu->arch.spmc[i] = set_reg_val(id, *val);
+		break;
 	case KVM_REG_PPC_SIAR:
 		vcpu->arch.siar = set_reg_val(id, *val);
 		break;
 	case KVM_REG_PPC_SDAR:
 		vcpu->arch.sdar = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_SIER:
+		vcpu->arch.sier = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_IAMR:
+		vcpu->arch.iamr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TFHAR:
+		vcpu->arch.tfhar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TFIAR:
+		vcpu->arch.tfiar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TEXASR:
+		vcpu->arch.texasr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_FSCR:
+		vcpu->arch.fscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PSPB:
+		vcpu->arch.pspb = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_EBBHR:
+		vcpu->arch.ebbhr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_EBBRR:
+		vcpu->arch.ebbrr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_BESCR:
+		vcpu->arch.bescr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TAR:
+		vcpu->arch.tar = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DPDES:
+		vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DAWR:
+		vcpu->arch.dawr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_DAWRX:
+		vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
+		break;
+	case KVM_REG_PPC_CIABR:
+		vcpu->arch.ciabr = set_reg_val(id, *val);
+		/* Don't allow setting breakpoints in hypervisor code */
+		if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+			vcpu->arch.ciabr &= ~CIABR_PRIV;	/* disable */
+		break;
+	case KVM_REG_PPC_IC:
+		vcpu->arch.ic = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_VTB:
+		vcpu->arch.vtb = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_CSIGR:
+		vcpu->arch.csigr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TACR:
+		vcpu->arch.tacr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_TCSCR:
+		vcpu->arch.tcscr = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PID:
+		vcpu->arch.pid = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_ACOP:
+		vcpu->arch.acop = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_WORT:
+		vcpu->arch.wort = set_reg_val(id, *val);
+		break;
 	case KVM_REG_PPC_VPA_ADDR:
 		addr = set_reg_val(id, *val);
 		r = -EINVAL;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 8bbe91bdb6da..691dd1ef555b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -460,6 +460,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	beq	38f
 	mtspr	SPRN_PCR, r7
 38:
+
+BEGIN_FTR_SECTION
+	/* DPDES is shared between threads */
+	ld	r8, VCORE_DPDES(r5)
+	mtspr	SPRN_DPDES, r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
 	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
 	b	10f
@@ -659,6 +666,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	mtspr	SPRN_MMCRA, r6
 	mtspr	SPRN_SIAR, r7
 	mtspr	SPRN_SDAR, r8
+BEGIN_FTR_SECTION
+	ld	r5, VCPU_MMCR + 24(r4)
+	ld	r6, VCPU_SIER(r4)
+	lwz	r7, VCPU_PMC + 24(r4)
+	lwz	r8, VCPU_PMC + 28(r4)
+	ld	r9, VCPU_MMCR + 32(r4)
+	mtspr	SPRN_MMCR2, r5
+	mtspr	SPRN_SIER, r6
+	mtspr	SPRN_SPMC1, r7
+	mtspr	SPRN_SPMC2, r8
+	mtspr	SPRN_MMCRS, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_MMCR0, r3
 	isync
 
@@ -690,6 +709,61 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_DSCR, r5
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 
+BEGIN_FTR_SECTION
+	/* Skip next section on POWER7 or PPC970 */
+	b	8f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+	/* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
+	mfmsr	r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r8
+
+	/* Load up POWER8-specific registers */
+	ld	r5, VCPU_IAMR(r4)
+	lwz	r6, VCPU_PSPB(r4)
+	ld	r7, VCPU_FSCR(r4)
+	mtspr	SPRN_IAMR, r5
+	mtspr	SPRN_PSPB, r6
+	mtspr	SPRN_FSCR, r7
+	ld	r5, VCPU_DAWR(r4)
+	ld	r6, VCPU_DAWRX(r4)
+	ld	r7, VCPU_CIABR(r4)
+	ld	r8, VCPU_TAR(r4)
+	mtspr	SPRN_DAWR, r5
+	mtspr	SPRN_DAWRX, r6
+	mtspr	SPRN_CIABR, r7
+	mtspr	SPRN_TAR, r8
+	ld	r5, VCPU_IC(r4)
+	ld	r6, VCPU_VTB(r4)
+	mtspr	SPRN_IC, r5
+	mtspr	SPRN_VTB, r6
+	ld	r5, VCPU_TFHAR(r4)
+	ld	r6, VCPU_TFIAR(r4)
+	ld	r7, VCPU_TEXASR(r4)
+	ld	r8, VCPU_EBBHR(r4)
+	mtspr	SPRN_TFHAR, r5
+	mtspr	SPRN_TFIAR, r6
+	mtspr	SPRN_TEXASR, r7
+	mtspr	SPRN_EBBHR, r8
+	ld	r5, VCPU_EBBRR(r4)
+	ld	r6, VCPU_BESCR(r4)
+	ld	r7, VCPU_CSIGR(r4)
+	ld	r8, VCPU_TACR(r4)
+	mtspr	SPRN_EBBRR, r5
+	mtspr	SPRN_BESCR, r6
+	mtspr	SPRN_CSIGR, r7
+	mtspr	SPRN_TACR, r8
+	ld	r5, VCPU_TCSCR(r4)
+	ld	r6, VCPU_ACOP(r4)
+	lwz	r7, VCPU_GUEST_PID(r4)
+	ld	r8, VCPU_WORT(r4)
+	mtspr	SPRN_TCSCR, r5
+	mtspr	SPRN_ACOP, r6
+	mtspr	SPRN_PID, r7
+	mtspr	SPRN_WORT, r8
+8:
+
 	/*
 	 * Set the decrementer to the guest decrementer.
 	 */
@@ -1081,6 +1155,54 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201)
 	add	r5,r5,r6
 	std	r5,VCPU_DEC_EXPIRES(r9)
 
+BEGIN_FTR_SECTION
+	b	8f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+	/* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
+	mfmsr	r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r8
+
+	/* Save POWER8-specific registers */
+	mfspr	r5, SPRN_IAMR
+	mfspr	r6, SPRN_PSPB
+	mfspr	r7, SPRN_FSCR
+	std	r5, VCPU_IAMR(r9)
+	stw	r6, VCPU_PSPB(r9)
+	std	r7, VCPU_FSCR(r9)
+	mfspr	r5, SPRN_IC
+	mfspr	r6, SPRN_VTB
+	mfspr	r7, SPRN_TAR
+	std	r5, VCPU_IC(r9)
+	std	r6, VCPU_VTB(r9)
+	std	r7, VCPU_TAR(r9)
+	mfspr	r5, SPRN_TFHAR
+	mfspr	r6, SPRN_TFIAR
+	mfspr	r7, SPRN_TEXASR
+	mfspr	r8, SPRN_EBBHR
+	std	r5, VCPU_TFHAR(r9)
+	std	r6, VCPU_TFIAR(r9)
+	std	r7, VCPU_TEXASR(r9)
+	std	r8, VCPU_EBBHR(r9)
+	mfspr	r5, SPRN_EBBRR
+	mfspr	r6, SPRN_BESCR
+	mfspr	r7, SPRN_CSIGR
+	mfspr	r8, SPRN_TACR
+	std	r5, VCPU_EBBRR(r9)
+	std	r6, VCPU_BESCR(r9)
+	std	r7, VCPU_CSIGR(r9)
+	std	r8, VCPU_TACR(r9)
+	mfspr	r5, SPRN_TCSCR
+	mfspr	r6, SPRN_ACOP
+	mfspr	r7, SPRN_PID
+	mfspr	r8, SPRN_WORT
+	std	r5, VCPU_TCSCR(r9)
+	std	r6, VCPU_ACOP(r9)
+	stw	r7, VCPU_GUEST_PID(r9)
+	std	r8, VCPU_WORT(r9)
+8:
+
 	/* Save and reset AMR and UAMOR before turning on the MMU */
 BEGIN_FTR_SECTION
 	mfspr	r5,SPRN_AMR
@@ -1190,6 +1312,20 @@ BEGIN_FTR_SECTION
 	stw	r10, VCPU_PMC + 24(r9)
 	stw	r11, VCPU_PMC + 28(r9)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
+BEGIN_FTR_SECTION
+	mfspr	r4, SPRN_MMCR2
+	mfspr	r5, SPRN_SIER
+	mfspr	r6, SPRN_SPMC1
+	mfspr	r7, SPRN_SPMC2
+	mfspr	r8, SPRN_MMCRS
+	std	r4, VCPU_MMCR + 24(r9)
+	std	r5, VCPU_SIER(r9)
+	stw	r6, VCPU_PMC + 24(r9)
+	stw	r7, VCPU_PMC + 28(r9)
+	std	r8, VCPU_MMCR + 32(r9)
+	lis	r4, 0x8000
+	mtspr	SPRN_MMCRS, r4
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 22:
 	/* Clear out SLB */
 	li	r5,0
@@ -1290,6 +1426,15 @@ secondary_too_late:
 	mtspr	SPRN_LPID,r7
 	isync
 
+BEGIN_FTR_SECTION
+	/* DPDES is shared between threads */
+	mfspr	r7, SPRN_DPDES
+	std	r7, VCORE_DPDES(r5)
+	/* clear DPDES so we don't get guest doorbells in the host */
+	li	r8, 0
+	mtspr	SPRN_DPDES, r8
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+
 	/* Subtract timebase offset from timebase */
 	ld	r8,VCORE_TB_OFFSET(r5)
 	cmpdi	r8,0
-- 
cgit v1.2.3


From bd3048b80caace9cf0ae9ad22b2fbb8333b44a97 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Wed, 8 Jan 2014 21:25:23 +1100
Subject: KVM: PPC: Book3S HV: Add handler for HV facility unavailable

At present this should never happen, since the host kernel sets
HFSCR to allow access to all facilities.  It's better to be prepared
to handle it cleanly if it does ever happen, though.

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_asm.h |  1 +
 arch/powerpc/kvm/book3s_hv.c       | 11 ++++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 1bd92fd43cfb..dba8fb244100 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -99,6 +99,7 @@
 #define BOOK3S_INTERRUPT_PERFMON	0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
 #define BOOK3S_INTERRUPT_VSX		0xf40
+#define BOOK3S_INTERRUPT_H_FAC_UNAVAIL	0xf80
 
 #define BOOK3S_IRQPRIO_SYSTEM_RESET		0
 #define BOOK3S_IRQPRIO_DATA_SEGMENT		1
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 5b08ddf91d2d..1bf681e8a05f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -706,7 +706,16 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	 * we don't emulate any guest instructions at this stage.
 	 */
 	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
-		kvmppc_core_queue_program(vcpu, 0x80000);
+		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+		r = RESUME_GUEST;
+		break;
+	/*
+	 * This occurs if the guest (kernel or userspace), does something that
+	 * is prohibited by HFSCR.  We just generate a program interrupt to
+	 * the guest.
+	 */
+	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
+		kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 		r = RESUME_GUEST;
 		break;
 	default:
-- 
cgit v1.2.3


From 5557ae0ec77c2b4b5bbce2883c0603054ab66e68 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Jan 2014 21:25:24 +1100
Subject: KVM: PPC: Book3S HV: Implement architecture compatibility modes for
 POWER8

This allows us to select architecture 2.05 (POWER6) or 2.06 (POWER7)
compatibility modes on a POWER8 processor.  (Note that transactional
memory is disabled for usermode if either or both of the PCR_TM_DIS
and PCR_ARCH_206 bits are set.)

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/reg.h |  2 ++
 arch/powerpc/kvm/book3s_hv.c   | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 2f41e6475648..5a9983147683 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -329,6 +329,8 @@
 #define SPRN_PCR	0x152	/* Processor compatibility register */
 #define   PCR_VEC_DIS	(1ul << (63-0))	/* Vec. disable (bit NA since POWER8) */
 #define   PCR_VSX_DIS	(1ul << (63-1))	/* VSX disable (bit NA since POWER8) */
+#define   PCR_TM_DIS	(1ul << (63-2))	/* Trans. memory disable (POWER8) */
+#define   PCR_ARCH_206	0x4		/* Architecture 2.06 */
 #define   PCR_ARCH_205	0x2		/* Architecture 2.05 */
 #define	SPRN_HEIR	0x153	/* Hypervisor Emulated Instruction Register */
 #define SPRN_TLBINDEXR	0x154	/* P7 TLB control register */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1bf681e8a05f..1e9f4b45432b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -184,14 +184,28 @@ int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 
 		switch (arch_compat) {
 		case PVR_ARCH_205:
-			pcr = PCR_ARCH_205;
+			/*
+			 * If an arch bit is set in PCR, all the defined
+			 * higher-order arch bits also have to be set.
+			 */
+			pcr = PCR_ARCH_206 | PCR_ARCH_205;
 			break;
 		case PVR_ARCH_206:
 		case PVR_ARCH_206p:
+			pcr = PCR_ARCH_206;
+			break;
+		case PVR_ARCH_207:
 			break;
 		default:
 			return -EINVAL;
 		}
+
+		if (!cpu_has_feature(CPU_FTR_ARCH_207S)) {
+			/* POWER7 can't emulate POWER8 */
+			if (!(pcr & PCR_ARCH_206))
+				return -EINVAL;
+			pcr &= ~PCR_ARCH_206;
+		}
 	}
 
 	spin_lock(&vc->lock);
-- 
cgit v1.2.3


From aa31e843225769735b79795c955426c9479046a5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Jan 2014 21:25:26 +1100
Subject: KVM: PPC: Book3S HV: Handle guest using doorbells for IPIs

* SRR1 wake reason field for system reset interrupt on wakeup from nap
  is now a 4-bit field on P8, compared to 3 bits on P7.

* Set PECEDP in LPCR when napping because of H_CEDE so guest doorbells
  will wake us up.

* Waking up from nap because of a guest doorbell interrupt is not a
  reason to exit the guest.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/reg.h          |  4 +++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 19 +++++++++++++++----
 2 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 5a9983147683..1248b40107ea 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -307,7 +307,9 @@
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
 #define   LPCR_AIL_0	0x00000000	/* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3	0x01800000	/* MMU on exception offset 0xc00...4xxx */
-#define   LPCR_PECE	0x00007000	/* powersave exit cause enable */
+#define   LPCR_PECE	0x0001f000	/* powersave exit cause enable */
+#define     LPCR_PECEDP	0x00010000	/* directed priv dbells cause exit */
+#define     LPCR_PECEDH	0x00008000	/* directed hyp dbells cause exit */
 #define     LPCR_PECE0	0x00004000	/* ext. exceptions can cause exit */
 #define     LPCR_PECE1	0x00002000	/* decrementer can cause exit */
 #define     LPCR_PECE2	0x00001000	/* machine check etc can cause exit */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 386e141dbf16..9e89c7577b4a 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1857,13 +1857,16 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206)
 	bl	kvmppc_save_fp
 
 	/*
-	 * Take a nap until a decrementer or external interrupt occurs,
-	 * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR
+	 * Take a nap until a decrementer or external or doobell interrupt
+	 * occurs, with PECE1, PECE0 and PECEDP set in LPCR
 	 */
 	li	r0,1
 	stb	r0,HSTATE_HWTHREAD_REQ(r13)
 	mfspr	r5,SPRN_LPCR
 	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1
+BEGIN_FTR_SECTION
+	oris	r5,r5,LPCR_PECEDP@h
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_LPCR,r5
 	isync
 	li	r0, 0
@@ -1979,14 +1982,22 @@ machine_check_realmode:
  */
 kvmppc_check_wake_reason:
 	mfspr	r6, SPRN_SRR1
-	rlwinm	r6, r6, 44-31, 0x7	/* extract wake reason field */
-	cmpwi	r6, 4			/* was it an external interrupt? */
+BEGIN_FTR_SECTION
+	rlwinm	r6, r6, 45-31, 0xf	/* extract wake reason field (P8) */
+FTR_SECTION_ELSE
+	rlwinm	r6, r6, 45-31, 0xe	/* P7 wake reason field is 3 bits */
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
+	cmpwi	r6, 8			/* was it an external interrupt? */
 	li	r12, BOOK3S_INTERRUPT_EXTERNAL
 	beq	kvmppc_read_intr	/* if so, see what it was */
 	li	r3, 0
 	li	r12, 0
 	cmpwi	r6, 6			/* was it the decrementer? */
 	beq	0f
+BEGIN_FTR_SECTION
+	cmpwi	r6, 5			/* privileged doorbell? */
+	beq	0f
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	li	r3, 1			/* anything else, return 1 */
 0:	blr
 
-- 
cgit v1.2.3


From e0622bd9f2fccc8a801fa7aaf4fa6d7c728c3a78 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Jan 2014 21:25:27 +1100
Subject: KVM: PPC: Book3S HV: Handle new LPCR bits on POWER8

POWER8 has a bit in the LPCR to enable or disable the PURR and SPURR
registers to count when in the guest.  Set this bit.

POWER8 has a field in the LPCR called AIL (Alternate Interrupt Location)
which is used to enable relocation-on interrupts.  Allow userspace to
set this field.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/reg.h | 2 ++
 arch/powerpc/kvm/book3s_hv.c   | 6 ++++++
 2 files changed, 8 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 1248b40107ea..05ecb072540c 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -305,8 +305,10 @@
 #define   LPCR_RMLS    0x1C000000      /* impl dependent rmo limit sel */
 #define	  LPCR_RMLS_SH	(63-37)
 #define   LPCR_ILE     0x02000000      /* !HV irqs set MSR:LE */
+#define   LPCR_AIL	0x01800000	/* Alternate interrupt location */
 #define   LPCR_AIL_0	0x00000000	/* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3	0x01800000	/* MMU on exception offset 0xc00...4xxx */
+#define   LPCR_ONL	0x00040000	/* online - PURR/SPURR count */
 #define   LPCR_PECE	0x0001f000	/* powersave exit cause enable */
 #define     LPCR_PECEDP	0x00010000	/* directed priv dbells cause exit */
 #define     LPCR_PECEDH	0x00008000	/* directed hyp dbells cause exit */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1e9f4b45432b..d7f2ec6f1419 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -789,8 +789,11 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr)
 	/*
 	 * Userspace can only modify DPFD (default prefetch depth),
 	 * ILE (interrupt little-endian) and TC (translation control).
+	 * On POWER8 userspace can also modify AIL (alt. interrupt loc.)
 	 */
 	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		mask |= LPCR_AIL;
 	vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
 	spin_unlock(&vc->lock);
 }
@@ -2166,6 +2169,9 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 			LPCR_VPM0 | LPCR_VPM1;
 		kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
 			(VRMA_VSID << SLB_VSID_SHIFT_1T);
+		/* On POWER8 turn on online bit to enable PURR/SPURR */
+		if (cpu_has_feature(CPU_FTR_ARCH_207S))
+			lpcr |= LPCR_ONL;
 	}
 	kvm->arch.lpcr = lpcr;
 
-- 
cgit v1.2.3


From 5d00f66b865e3782c5852cdafe1cea11a292a81e Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Jan 2014 21:25:28 +1100
Subject: KVM: PPC: Book3S HV: Prepare for host using hypervisor doorbells

POWER8 has support for hypervisor doorbell interrupts.  Though the
kernel doesn't use them for IPIs on the powernv platform yet, it
probably will in future, so this makes KVM cope gracefully if a
hypervisor doorbell interrupt arrives while in a guest.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_asm.h      | 1 +
 arch/powerpc/kvm/book3s_hv.c            | 1 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 7 +++++++
 3 files changed, 9 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index dba8fb244100..c3815b1e79e8 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -96,6 +96,7 @@
 #define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
 #define BOOK3S_INTERRUPT_H_INST_STORAGE	0xe20
 #define BOOK3S_INTERRUPT_H_EMUL_ASSIST	0xe40
+#define BOOK3S_INTERRUPT_H_DOORBELL	0xe80
 #define BOOK3S_INTERRUPT_PERFMON	0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC	0xf20
 #define BOOK3S_INTERRUPT_VSX		0xf40
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d7f2ec6f1419..216049ff7368 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -651,6 +651,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 	case BOOK3S_INTERRUPT_EXTERNAL:
+	case BOOK3S_INTERRUPT_H_DOORBELL:
 		vcpu->stat.ext_intr_exits++;
 		r = RESUME_GUEST;
 		break;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9e89c7577b4a..eae4ab9b9135 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1997,10 +1997,17 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S)
 BEGIN_FTR_SECTION
 	cmpwi	r6, 5			/* privileged doorbell? */
 	beq	0f
+	cmpwi	r6, 3			/* hypervisor doorbell? */
+	beq	3f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	li	r3, 1			/* anything else, return 1 */
 0:	blr
 
+	/* hypervisor doorbell */
+3:	li	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	li	r3, 1
+	blr
+
 /*
  * Determine what sort of external interrupt is pending (if any).
  * Returns:
-- 
cgit v1.2.3


From 8563bf52d509213e746295341ab52896b562ca5e Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Jan 2014 21:25:29 +1100
Subject: KVM: PPC: Book3S HV: Add support for DABRX register on POWER7

The DABRX (DABR extension) register on POWER7 processors provides finer
control over which accesses cause a data breakpoint interrupt.  It
contains 3 bits which indicate whether to enable accesses in user,
kernel and hypervisor modes respectively to cause data breakpoint
interrupts, plus one bit that enables both real mode and virtual mode
accesses to cause interrupts.  Currently, KVM sets DABRX to allow
both kernel and user accesses to cause interrupts while in the guest.

This adds support for the guest to specify other values for DABRX.
PAPR defines a H_SET_XDABR hcall to allow the guest to set both DABR
and DABRX with one call.  This adds a real-mode implementation of
H_SET_XDABR, which shares most of its code with the existing H_SET_DABR
implementation.  To support this, we add a per-vcpu field to store the
DABRX value plus code to get and set it via the ONE_REG interface.

For Linux guests to use this new hcall, userspace needs to add
"hcall-xdabr" to the set of strings in the /chosen/hypertas-functions
property in the device tree.  If userspace does this and then migrates
the guest to a host where the kernel doesn't include this patch, then
userspace will need to implement H_SET_XDABR by writing the specified
DABR value to the DABR using the ONE_REG interface.  In that case, the
old kernel will set DABRX to DABRX_USER | DABRX_KERNEL.  That should
still work correctly, at least for Linux guests, since Linux guests
cope with getting data breakpoint interrupts in modes that weren't
requested by just ignoring the interrupt, and Linux guests never set
DABRX_BTI.

The other thing this does is to make H_SET_DABR and H_SET_XDABR work
on POWER8, which has the DAWR and DAWRX instead of DABR/X.  Guests that
know about POWER8 should use H_SET_MODE rather than H_SET_[X]DABR, but
guests running in POWER7 compatibility mode will still use H_SET_[X]DABR.
For them, this adds the logic to convert DABR/X values into DAWR/X values
on POWER8.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/virtual/kvm/api.txt       |  1 +
 arch/powerpc/include/asm/kvm_host.h     |  1 +
 arch/powerpc/include/asm/reg.h          | 18 +++++++++++-------
 arch/powerpc/include/uapi/asm/kvm.h     |  2 ++
 arch/powerpc/kernel/asm-offsets.c       |  1 +
 arch/powerpc/kvm/book3s_hv.c            |  6 ++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 32 ++++++++++++++++++++++++++++++--
 7 files changed, 52 insertions(+), 9 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a30035dd4c26..5056baf574f4 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1838,6 +1838,7 @@ registers, find a list below:
   PPC   | KVM_REG_PPC_LPCR	| 64
   PPC   | KVM_REG_PPC_PPR	| 64
   PPC   | KVM_REG_PPC_ARCH_COMPAT 32
+  PPC   | KVM_REG_PPC_DABRX     | 32
   PPC   | KVM_REG_PPC_TM_GPR0	| 64
           ...
   PPC   | KVM_REG_PPC_TM_GPR31	| 64
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 81c92d1d7978..d161bc09153b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -466,6 +466,7 @@ struct kvm_vcpu_arch {
 	ulong uamor;
 	ulong iamr;
 	u32 ctrl;
+	u32 dabrx;
 	ulong dabr;
 	ulong dawr;
 	ulong dawrx;
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 05ecb072540c..adf644a80a3e 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -229,16 +229,20 @@
 #define   CIABR_PRIV_SUPER	2
 #define   CIABR_PRIV_HYPER	3
 #define SPRN_DAWRX	0xBC
-#define   DAWRX_USER	(1UL << 0)
-#define   DAWRX_KERNEL	(1UL << 1)
-#define   DAWRX_HYP	(1UL << 2)
+#define   DAWRX_USER	__MASK(0)
+#define   DAWRX_KERNEL	__MASK(1)
+#define   DAWRX_HYP	__MASK(2)
+#define   DAWRX_WTI	__MASK(3)
+#define   DAWRX_WT	__MASK(4)
+#define   DAWRX_DR	__MASK(5)
+#define   DAWRX_DW	__MASK(6)
 #define SPRN_DABR	0x3F5	/* Data Address Breakpoint Register */
 #define SPRN_DABR2	0x13D	/* e300 */
 #define SPRN_DABRX	0x3F7	/* Data Address Breakpoint Register Extension */
-#define   DABRX_USER	(1UL << 0)
-#define   DABRX_KERNEL	(1UL << 1)
-#define   DABRX_HYP	(1UL << 2)
-#define   DABRX_BTI	(1UL << 3)
+#define   DABRX_USER	__MASK(0)
+#define   DABRX_KERNEL	__MASK(1)
+#define   DABRX_HYP	__MASK(2)
+#define   DABRX_BTI	__MASK(3)
 #define   DABRX_ALL     (DABRX_BTI | DABRX_HYP | DABRX_KERNEL | DABRX_USER)
 #define SPRN_DAR	0x013	/* Data Address Register */
 #define SPRN_DBCR	0x136	/* e300 Data Breakpoint Control Reg */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index a586fb9b77bd..a6665be4f3ab 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -554,6 +554,8 @@ struct kvm_get_htab_header {
 /* Architecture compatibility level */
 #define KVM_REG_PPC_ARCH_COMPAT	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb7)
 
+#define KVM_REG_PPC_DABRX	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xb8)
+
 /* Transactional Memory checkpointed state:
  * This is all GPRs, all VSX regs and a subset of SPRs
  */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 043900abbbb0..239a857f1141 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -493,6 +493,7 @@ int main(void)
 	DEFINE(VCPU_IAMR, offsetof(struct kvm_vcpu, arch.iamr));
 	DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl));
 	DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr));
+	DEFINE(VCPU_DABRX, offsetof(struct kvm_vcpu, arch.dabrx));
 	DEFINE(VCPU_DAWR, offsetof(struct kvm_vcpu, arch.dawr));
 	DEFINE(VCPU_DAWRX, offsetof(struct kvm_vcpu, arch.dawrx));
 	DEFINE(VCPU_CIABR, offsetof(struct kvm_vcpu, arch.ciabr));
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 216049ff7368..eb4eed4a7173 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -812,6 +812,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_DABR:
 		*val = get_reg_val(id, vcpu->arch.dabr);
 		break;
+	case KVM_REG_PPC_DABRX:
+		*val = get_reg_val(id, vcpu->arch.dabrx);
+		break;
 	case KVM_REG_PPC_DSCR:
 		*val = get_reg_val(id, vcpu->arch.dscr);
 		break;
@@ -967,6 +970,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_DABR:
 		vcpu->arch.dabr = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_DABRX:
+		vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
+		break;
 	case KVM_REG_PPC_DSCR:
 		vcpu->arch.dscr = set_reg_val(id, *val);
 		break;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index eae4ab9b9135..56299349e94b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -585,7 +585,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 BEGIN_FTR_SECTION
 	/* Set partition DABR */
 	/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
-	li	r5,3
+	lwz	r5,VCPU_DABRX(r4)
 	ld	r6,VCPU_DABR(r4)
 	mtspr	SPRN_DABRX,r5
 	mtspr	SPRN_DABR,r6
@@ -1763,24 +1763,52 @@ hcall_real_table:
 	.long	0		/* 0x11c */
 	.long	0		/* 0x120 */
 	.long	.kvmppc_h_bulk_remove - hcall_real_table
+	.long	0		/* 0x128 */
+	.long	0		/* 0x12c */
+	.long	0		/* 0x130 */
+	.long	.kvmppc_h_set_xdabr - hcall_real_table
 hcall_real_table_end:
 
 ignore_hdec:
 	mr	r4,r9
 	b	fast_guest_return
 
+_GLOBAL(kvmppc_h_set_xdabr)
+	andi.	r0, r5, DABRX_USER | DABRX_KERNEL
+	beq	6f
+	li	r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
+	andc.	r0, r5, r0
+	beq	3f
+6:	li	r3, H_PARAMETER
+	blr
+
 _GLOBAL(kvmppc_h_set_dabr)
+	li	r5, DABRX_USER | DABRX_KERNEL
+3:
 BEGIN_FTR_SECTION
 	b	2f
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	std	r4,VCPU_DABR(r3)
+	stw	r5, VCPU_DABRX(r3)
+	mtspr	SPRN_DABRX, r5
 	/* Work around P7 bug where DABR can get corrupted on mtspr */
 1:	mtspr	SPRN_DABR,r4
 	mfspr	r5, SPRN_DABR
 	cmpd	r4, r5
 	bne	1b
 	isync
-2:	li	r3,0
+	li	r3,0
+	blr
+
+	/* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */
+2:	rlwimi	r5, r4, 5, DAWRX_DR | DAWRX_DW
+	rlwimi	r5, r4, 1, DAWRX_WT
+	clrrdi	r4, r4, 3
+	std	r4, VCPU_DAWR(r3)
+	std	r5, VCPU_DAWRX(r3)
+	mtspr	SPRN_DAWR, r4
+	mtspr	SPRN_DAWRX, r5
+	li	r3, 0
 	blr
 
 _GLOBAL(kvmppc_h_cede)
-- 
cgit v1.2.3


From d682916a381ac7c8eb965c10ab64bc7cc2f18647 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 8 Jan 2014 21:25:30 +1100
Subject: KVM: PPC: Book3S HV: Basic little-endian guest support

We create a guest MSR from scratch when delivering exceptions in
a few places.  Instead of extracting LPCR[ILE] and inserting it
into MSR_LE each time, we simply create a new variable intr_msr which
contains the entire MSR to use.  For a little-endian guest, userspace
needs to set the ILE (interrupt little-endian) bit in the LPCR for
each vcpu (or at least one vcpu in each virtual core).

[paulus@samba.org - removed H_SET_MODE implementation from original
version of the patch, and made kvmppc_set_lpcr update vcpu->arch.intr_msr.]

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h     |  1 +
 arch/powerpc/kernel/asm-offsets.c       |  1 +
 arch/powerpc/kvm/book3s_64_mmu_hv.c     |  2 +-
 arch/powerpc/kvm/book3s_hv.c            | 22 ++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 15 +++++----------
 5 files changed, 30 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index d161bc09153b..7726a3bc8ff0 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -636,6 +636,7 @@ struct kvm_vcpu_arch {
 	spinlock_t tbacct_lock;
 	u64 busy_stolen;
 	u64 busy_preempt;
+	unsigned long intr_msr;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 239a857f1141..a60a2fdae5bd 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -480,6 +480,7 @@ int main(void)
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
 	DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
+	DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index efb8aa544876..22cc895333e6 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -262,7 +262,7 @@ int kvmppc_mmu_hv_init(void)
 
 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 {
-	kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
+	kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
 }
 
 /*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index eb4eed4a7173..3195e4f8a2ed 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -787,6 +787,27 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr)
 	u64 mask;
 
 	spin_lock(&vc->lock);
+	/*
+	 * If ILE (interrupt little-endian) has changed, update the
+	 * MSR_LE bit in the intr_msr for each vcpu in this vcore.
+	 */
+	if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
+		struct kvm *kvm = vcpu->kvm;
+		struct kvm_vcpu *vcpu;
+		int i;
+
+		mutex_lock(&kvm->lock);
+		kvm_for_each_vcpu(i, vcpu, kvm) {
+			if (vcpu->arch.vcore != vc)
+				continue;
+			if (new_lpcr & LPCR_ILE)
+				vcpu->arch.intr_msr |= MSR_LE;
+			else
+				vcpu->arch.intr_msr &= ~MSR_LE;
+		}
+		mutex_unlock(&kvm->lock);
+	}
+
 	/*
 	 * Userspace can only modify DPFD (default prefetch depth),
 	 * ILE (interrupt little-endian) and TC (translation control).
@@ -1155,6 +1176,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	spin_lock_init(&vcpu->arch.vpa_update_lock);
 	spin_lock_init(&vcpu->arch.tbacct_lock);
 	vcpu->arch.busy_preempt = TB_NIL;
+	vcpu->arch.intr_msr = MSR_SF | MSR_ME;
 
 	kvmppc_mmu_book3s_hv_init(vcpu);
 
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 56299349e94b..ecb76357c9d0 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -812,8 +812,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
 12:	mtspr	SPRN_SRR0, r10
 	mr	r10,r0
 	mtspr	SPRN_SRR1, r11
-	li	r11,(MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
-	rotldi	r11,r11,63
+	ld	r11, VCPU_INTR_MSR(r4)
 5:
 
 /*
@@ -1551,8 +1550,7 @@ kvmppc_hdsi:
 	mtspr	SPRN_SRR0, r10
 	mtspr	SPRN_SRR1, r11
 	li	r10, BOOK3S_INTERRUPT_DATA_STORAGE
-	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
-	rotldi	r11, r11, 63
+	ld	r11, VCPU_INTR_MSR(r9)
 fast_interrupt_c_return:
 6:	ld	r7, VCPU_CTR(r9)
 	lwz	r8, VCPU_XER(r9)
@@ -1621,8 +1619,7 @@ kvmppc_hisi:
 1:	mtspr	SPRN_SRR0, r10
 	mtspr	SPRN_SRR1, r11
 	li	r10, BOOK3S_INTERRUPT_INST_STORAGE
-	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
-	rotldi	r11, r11, 63
+	ld	r11, VCPU_INTR_MSR(r9)
 	b	fast_interrupt_c_return
 
 3:	ld	r6, VCPU_KVM(r9)	/* not relocated, use VRMA */
@@ -1665,8 +1662,7 @@ sc_1_fast_return:
 	mtspr	SPRN_SRR0,r10
 	mtspr	SPRN_SRR1,r11
 	li	r10, BOOK3S_INTERRUPT_SYSCALL
-	li	r11, (MSR_ME << 1) | 1  /* synthesize MSR_SF | MSR_ME */
-	rotldi	r11, r11, 63
+	ld	r11, VCPU_INTR_MSR(r9)
 	mr	r4,r9
 	b	fast_guest_return
 
@@ -1994,8 +1990,7 @@ machine_check_realmode:
 	beq	mc_cont
 	/* If not, deliver a machine check.  SRR0/1 are already set */
 	li	r10, BOOK3S_INTERRUPT_MACHINE_CHECK
-	li	r11, (MSR_ME << 1) | 1	/* synthesize MSR_SF | MSR_ME */
-	rotldi	r11, r11, 63
+	ld	r11, VCPU_INTR_MSR(r9)
 	b	fast_interrupt_c_return
 
 /*
-- 
cgit v1.2.3


From 7b490411c37f7ab7965cbdfe5e3ec28eadb6db5b Mon Sep 17 00:00:00 2001
From: Michael Neuling <mikey@neuling.org>
Date: Wed, 8 Jan 2014 21:25:32 +1100
Subject: KVM: PPC: Book3S HV: Add new state for transactional memory

Add new state for transactional memory (TM) to kvm_vcpu_arch.  Also add
asm-offset bits that are going to be required.

This also moves the existing TFHAR, TFIAR and TEXASR SPRs into a
CONFIG_PPC_TRANSACTIONAL_MEM section.  This requires some code changes to
ensure we still compile with CONFIG_PPC_TRANSACTIONAL_MEM=N.  Much of the added
the added #ifdefs are removed in a later patch when the bulk of the TM code is
added.

Signed-off-by: Michael Neuling <mikey@neuling.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
[agraf: fix merge conflict]
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h     | 24 +++++++++--
 arch/powerpc/kernel/asm-offsets.c       | 19 +++++++--
 arch/powerpc/kvm/book3s_hv.c            |  4 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 75 ++++++++++++++++++++++++++++++++-
 4 files changed, 114 insertions(+), 8 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 7726a3bc8ff0..1eaea2dea174 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -475,9 +475,6 @@ struct kvm_vcpu_arch {
 	ulong ppr;
 	ulong pspb;
 	ulong fscr;
-	ulong tfhar;
-	ulong tfiar;
-	ulong texasr;
 	ulong ebbhr;
 	ulong ebbrr;
 	ulong bescr;
@@ -526,6 +523,27 @@ struct kvm_vcpu_arch {
 	u64 siar;
 	u64 sdar;
 	u64 sier;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	u64 tfhar;
+	u64 texasr;
+	u64 tfiar;
+
+	u32 cr_tm;
+	u64 lr_tm;
+	u64 ctr_tm;
+	u64 amr_tm;
+	u64 ppr_tm;
+	u64 dscr_tm;
+	u64 tar_tm;
+
+	ulong gpr_tm[32];
+
+	struct thread_fp_state fp_tm;
+
+	struct thread_vr_state vr_tm;
+	u32 vrsave_tm; /* also USPRG0 */
+
+#endif
 
 #ifdef CONFIG_KVM_EXIT_TIMING
 	struct mutex exit_timing_lock;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index a60a2fdae5bd..687f2ebf0cce 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -521,9 +521,6 @@ int main(void)
 	DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
 	DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr));
 	DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb));
-	DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar));
-	DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar));
-	DEFINE(VCPU_TEXASR, offsetof(struct kvm_vcpu, arch.texasr));
 	DEFINE(VCPU_EBBHR, offsetof(struct kvm_vcpu, arch.ebbhr));
 	DEFINE(VCPU_EBBRR, offsetof(struct kvm_vcpu, arch.ebbrr));
 	DEFINE(VCPU_BESCR, offsetof(struct kvm_vcpu, arch.bescr));
@@ -545,6 +542,22 @@ int main(void)
 	DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige));
 	DEFINE(VCPU_SLB_V, offsetof(struct kvmppc_slb, origv));
 	DEFINE(VCPU_SLB_SIZE, sizeof(struct kvmppc_slb));
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar));
+	DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar));
+	DEFINE(VCPU_TEXASR, offsetof(struct kvm_vcpu, arch.texasr));
+	DEFINE(VCPU_GPR_TM, offsetof(struct kvm_vcpu, arch.gpr_tm));
+	DEFINE(VCPU_FPRS_TM, offsetof(struct kvm_vcpu, arch.fp_tm.fpr));
+	DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr));
+	DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm));
+	DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm));
+	DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm));
+	DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm));
+	DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm));
+	DEFINE(VCPU_PPR_TM, offsetof(struct kvm_vcpu, arch.ppr_tm));
+	DEFINE(VCPU_DSCR_TM, offsetof(struct kvm_vcpu, arch.dscr_tm));
+	DEFINE(VCPU_TAR_TM, offsetof(struct kvm_vcpu, arch.tar_tm));
+#endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3195e4f8a2ed..f4a4c5c82fb2 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -875,6 +875,7 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_IAMR:
 		*val = get_reg_val(id, vcpu->arch.iamr);
 		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	case KVM_REG_PPC_TFHAR:
 		*val = get_reg_val(id, vcpu->arch.tfhar);
 		break;
@@ -884,6 +885,7 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_TEXASR:
 		*val = get_reg_val(id, vcpu->arch.texasr);
 		break;
+#endif
 	case KVM_REG_PPC_FSCR:
 		*val = get_reg_val(id, vcpu->arch.fscr);
 		break;
@@ -1033,6 +1035,7 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_IAMR:
 		vcpu->arch.iamr = set_reg_val(id, *val);
 		break;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	case KVM_REG_PPC_TFHAR:
 		vcpu->arch.tfhar = set_reg_val(id, *val);
 		break;
@@ -1042,6 +1045,7 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_TEXASR:
 		vcpu->arch.texasr = set_reg_val(id, *val);
 		break;
+#endif
 	case KVM_REG_PPC_FSCR:
 		vcpu->arch.fscr = set_reg_val(id, *val);
 		break;
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index ecb76357c9d0..dfa144cb199b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -701,13 +701,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	ld	r6, VCPU_VTB(r4)
 	mtspr	SPRN_IC, r5
 	mtspr	SPRN_VTB, r6
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	ld	r5, VCPU_TFHAR(r4)
 	ld	r6, VCPU_TFIAR(r4)
 	ld	r7, VCPU_TEXASR(r4)
-	ld	r8, VCPU_EBBHR(r4)
 	mtspr	SPRN_TFHAR, r5
 	mtspr	SPRN_TFIAR, r6
 	mtspr	SPRN_TEXASR, r7
+#endif
+	ld	r8, VCPU_EBBHR(r4)
 	mtspr	SPRN_EBBHR, r8
 	ld	r5, VCPU_EBBRR(r4)
 	ld	r6, VCPU_BESCR(r4)
@@ -1118,13 +1120,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	std	r5, VCPU_IC(r9)
 	std	r6, VCPU_VTB(r9)
 	std	r7, VCPU_TAR(r9)
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 	mfspr	r5, SPRN_TFHAR
 	mfspr	r6, SPRN_TFIAR
 	mfspr	r7, SPRN_TEXASR
-	mfspr	r8, SPRN_EBBHR
 	std	r5, VCPU_TFHAR(r9)
 	std	r6, VCPU_TFIAR(r9)
 	std	r7, VCPU_TEXASR(r9)
+#endif
+	mfspr	r8, SPRN_EBBHR
 	std	r8, VCPU_EBBHR(r9)
 	mfspr	r5, SPRN_EBBRR
 	mfspr	r6, SPRN_BESCR
@@ -1497,6 +1501,73 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 1:	addi	r8,r8,16
 	.endr
 
+	/* Save DEC */
+	mfspr	r5,SPRN_DEC
+	mftb	r6
+	extsw	r5,r5
+	add	r5,r5,r6
+	std	r5,VCPU_DEC_EXPIRES(r9)
+
+BEGIN_FTR_SECTION
+	b	8f
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+	/* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
+	mfmsr	r8
+	li	r0, 1
+	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+	mtmsrd	r8
+
+	/* Save POWER8-specific registers */
+	mfspr	r5, SPRN_IAMR
+	mfspr	r6, SPRN_PSPB
+	mfspr	r7, SPRN_FSCR
+	std	r5, VCPU_IAMR(r9)
+	stw	r6, VCPU_PSPB(r9)
+	std	r7, VCPU_FSCR(r9)
+	mfspr	r5, SPRN_IC
+	mfspr	r6, SPRN_VTB
+	mfspr	r7, SPRN_TAR
+	std	r5, VCPU_IC(r9)
+	std	r6, VCPU_VTB(r9)
+	std	r7, VCPU_TAR(r9)
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	mfspr	r5, SPRN_TFHAR
+	mfspr	r6, SPRN_TFIAR
+	mfspr	r7, SPRN_TEXASR
+	std	r5, VCPU_TFHAR(r9)
+	std	r6, VCPU_TFIAR(r9)
+	std	r7, VCPU_TEXASR(r9)
+#endif
+	mfspr	r8, SPRN_EBBHR
+	std	r8, VCPU_EBBHR(r9)
+	mfspr	r5, SPRN_EBBRR
+	mfspr	r6, SPRN_BESCR
+	mfspr	r7, SPRN_CSIGR
+	mfspr	r8, SPRN_TACR
+	std	r5, VCPU_EBBRR(r9)
+	std	r6, VCPU_BESCR(r9)
+	std	r7, VCPU_CSIGR(r9)
+	std	r8, VCPU_TACR(r9)
+	mfspr	r5, SPRN_TCSCR
+	mfspr	r6, SPRN_ACOP
+	mfspr	r7, SPRN_PID
+	mfspr	r8, SPRN_WORT
+	std	r5, VCPU_TCSCR(r9)
+	std	r6, VCPU_ACOP(r9)
+	stw	r7, VCPU_GUEST_PID(r9)
+	std	r8, VCPU_WORT(r9)
+8:
+
+	/* Save and reset AMR and UAMOR before turning on the MMU */
+BEGIN_FTR_SECTION
+	mfspr	r5,SPRN_AMR
+	mfspr	r6,SPRN_UAMOR
+	std	r5,VCPU_AMR(r9)
+	std	r6,VCPU_UAMOR(r9)
+	li	r6,0
+	mtspr	SPRN_AMR,r6
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
+
 	/* Unset guest mode */
 	li	r0, KVM_GUEST_MODE_NONE
 	stb	r0, HSTATE_IN_GUEST(r13)
-- 
cgit v1.2.3


From 4068890931f62752abc3591e7b3736e7537c6dcb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Jan 2014 21:25:36 +1100
Subject: KVM: PPC: Book3S PR: Cope with doorbell interrupts

When the PR host is running on a POWER8 machine in POWER8 mode, it
will use doorbell interrupts for IPIs.  If one of them arrives while
we are in the guest, we pop out of the guest with trap number 0xA00,
which isn't handled by kvmppc_handle_exit_pr, leading to the following
BUG_ON:

[  331.436215] exit_nr=0xa00 | pc=0x1d2c | msr=0x800000000000d032
[  331.437522] ------------[ cut here ]------------
[  331.438296] kernel BUG at arch/powerpc/kvm/book3s_pr.c:982!
[  331.439063] Oops: Exception in kernel mode, sig: 5 [#2]
[  331.439819] SMP NR_CPUS=1024 NUMA pSeries
[  331.440552] Modules linked in: tun nf_conntrack_netbios_ns nf_conntrack_broadcast ipt_MASQUERADE ip6t_REJECT xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw virtio_net kvm binfmt_misc ibmvscsi scsi_transport_srp scsi_tgt virtio_blk
[  331.447614] CPU: 11 PID: 1296 Comm: qemu-system-ppc Tainted: G      D      3.11.7-200.2.fc19.ppc64p7 #1
[  331.448920] task: c0000003bdc8c000 ti: c0000003bd32c000 task.ti: c0000003bd32c000
[  331.450088] NIP: d0000000025d6b9c LR: d0000000025d6b98 CTR: c0000000004cfdd0
[  331.451042] REGS: c0000003bd32f420 TRAP: 0700   Tainted: G      D       (3.11.7-200.2.fc19.ppc64p7)
[  331.452331] MSR: 800000000282b032 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI>  CR: 28004824  XER: 20000000
[  331.454616] SOFTE: 1
[  331.455106] CFAR: c000000000848bb8
[  331.455726]
GPR00: d0000000025d6b98 c0000003bd32f6a0 d0000000026017b8 0000000000000032
GPR04: c0000000018627f8 c000000001873208 320d0a3030303030 3030303030643033
GPR08: c000000000c490a8 0000000000000000 0000000000000000 0000000000000002
GPR12: 0000000028004822 c00000000fdc6300 0000000000000000 00000100076ec310
GPR16: 000000002ae343b8 00003ffffd397398 0000000000000000 0000000000000000
GPR20: 00000100076f16f4 00000100076ebe60 0000000000000008 ffffffffffffffff
GPR24: 0000000000000000 0000008001041e60 0000000000000000 0000008001040ce8
GPR28: c0000003a2d80000 0000000000000a00 0000000000000001 c0000003a2681810
[  331.466504] NIP [d0000000025d6b9c] .kvmppc_handle_exit_pr+0x75c/0xa80 [kvm]
[  331.466999] LR [d0000000025d6b98] .kvmppc_handle_exit_pr+0x758/0xa80 [kvm]
[  331.467517] Call Trace:
[  331.467909] [c0000003bd32f6a0] [d0000000025d6b98] .kvmppc_handle_exit_pr+0x758/0xa80 [kvm] (unreliable)
[  331.468553] [c0000003bd32f750] [d0000000025d98f0] kvm_start_lightweight+0xb4/0xc4 [kvm]
[  331.469189] [c0000003bd32f920] [d0000000025d7648] .kvmppc_vcpu_run_pr+0xd8/0x270 [kvm]
[  331.469838] [c0000003bd32f9c0] [d0000000025cf748] .kvmppc_vcpu_run+0xc8/0xf0 [kvm]
[  331.470790] [c0000003bd32fa50] [d0000000025cc19c] .kvm_arch_vcpu_ioctl_run+0x5c/0x1b0 [kvm]
[  331.471401] [c0000003bd32fae0] [d0000000025c4888] .kvm_vcpu_ioctl+0x478/0x730 [kvm]
[  331.472026] [c0000003bd32fc90] [c00000000026192c] .do_vfs_ioctl+0x4dc/0x7a0
[  331.472561] [c0000003bd32fd80] [c000000000261cc4] .SyS_ioctl+0xd4/0xf0
[  331.473095] [c0000003bd32fe30] [c000000000009ed8] syscall_exit+0x0/0x98
[  331.473633] Instruction dump:
[  331.473766] 4bfff9b4 2b9d0800 419efc18 60000000 60420000 3d220000 e8bf11a0 e8df12a8
[  331.474733] 7fa4eb78 e8698660 48015165 e8410028 <0fe00000> 813f00e4 3ba00000 39290001
[  331.475386] ---[ end trace 49fc47d994c1f8f2 ]---
[  331.479817]

This fixes the problem by making kvmppc_handle_exit_pr() recognize the
interrupt.  We also need to jump to the doorbell interrupt handler in
book3s_segment.S to handle the interrupt on the way out of the guest.
Having done that, there's nothing further to be done in
kvmppc_handle_exit_pr().

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_asm.h | 1 +
 arch/powerpc/kvm/book3s_pr.c       | 1 +
 arch/powerpc/kvm/book3s_segment.S  | 2 ++
 3 files changed, 4 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index c3815b1e79e8..8337c33c2eb3 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -91,6 +91,7 @@
 #define BOOK3S_INTERRUPT_FP_UNAVAIL	0x800
 #define BOOK3S_INTERRUPT_DECREMENTER	0x900
 #define BOOK3S_INTERRUPT_HV_DECREMENTER	0x980
+#define BOOK3S_INTERRUPT_DOORBELL	0xa00
 #define BOOK3S_INTERRUPT_SYSCALL	0xc00
 #define BOOK3S_INTERRUPT_TRACE		0xd00
 #define BOOK3S_INTERRUPT_H_DATA_STORAGE	0xe00
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index e82fafdaf880..6a5fc7d53de6 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -827,6 +827,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	/* We're good on these - the host merely wanted to get our attention */
 	case BOOK3S_INTERRUPT_DECREMENTER:
 	case BOOK3S_INTERRUPT_HV_DECREMENTER:
+	case BOOK3S_INTERRUPT_DOORBELL:
 		vcpu->stat.dec_exits++;
 		r = RESUME_GUEST;
 		break;
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index bc50c97751d3..1e0cc2adfd40 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -361,6 +361,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	beqa	BOOK3S_INTERRUPT_DECREMENTER
 	cmpwi	r12, BOOK3S_INTERRUPT_PERFMON
 	beqa	BOOK3S_INTERRUPT_PERFMON
+	cmpwi	r12, BOOK3S_INTERRUPT_DOORBELL
+	beqa	BOOK3S_INTERRUPT_DOORBELL
 
 	RFI
 kvmppc_handler_trampoline_exit_end:
-- 
cgit v1.2.3


From 3405d230b374b6923878b21b8d708d7db1f734ef Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 15 Jan 2014 18:14:28 +1100
Subject: powerpc: Add support for the optimised lockref implementation

This commit adds the architecture support required to enable the
optimised implementation of lockrefs.

That's as simple as defining arch_spin_value_unlocked() and selecting
the Kconfig option.

We also define cmpxchg64_relaxed(), because the lockref code does not
need the cmpxchg to have barrier semantics.

Using Linus' test case[1] on one system I see a 4x improvement for the
basic enablement, and a further 1.3x for cmpxchg64_relaxed(), for a
total of 5.3x vs the baseline.

On another system I see more like 2x improvement.

[1]: http://marc.info/?l=linux-fsdevel&m=137782380714721&w=4

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/Kconfig                | 1 +
 arch/powerpc/include/asm/cmpxchg.h  | 1 +
 arch/powerpc/include/asm/spinlock.h | 5 +++++
 3 files changed, 7 insertions(+)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index fa395179ddd6..6ca5d5cabeb1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -139,6 +139,7 @@ config PPC
 	select OLD_SIGACTION if PPC32
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_IRQ_EXIT_ON_IRQ_STACK
+	select ARCH_USE_CMPXCHG_LOCKREF if PPC64
 
 config GENERIC_CSUM
 	def_bool CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index e245aab7f191..d463c68fe7f0 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -300,6 +300,7 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
 	cmpxchg_local((ptr), (o), (n));					\
   })
+#define cmpxchg64_relaxed	cmpxchg64_local
 #else
 #include <asm-generic/cmpxchg-local.h>
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 5f54a744dcc5..5162f8cd18c0 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -54,6 +54,11 @@
 #define SYNC_IO
 #endif
 
+static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
+{
+	return lock.slock == 0;
+}
+
 /*
  * This returns the old value in the lock, so we succeeded
  * in getting the lock if the return value is 0.
-- 
cgit v1.2.3


From 7179ba52889bef7e5e23f72908270e1ab2b7fc6f Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 15 Jan 2014 18:14:29 +1100
Subject: powerpc: Implement arch_spin_is_locked() using
 arch_spin_value_unlocked()

At a glance these are just the inverse of each other. The one subtlety
is that arch_spin_value_unlocked() takes the lock by value, rather than
as a pointer, which is important for the lockref code.

On the other hand arch_spin_is_locked() doesn't really care, so
implement it in terms of arch_spin_value_unlocked().

Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/spinlock.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 5162f8cd18c0..a30ef6999d66 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -28,8 +28,6 @@
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 
-#define arch_spin_is_locked(x)		((x)->slock != 0)
-
 #ifdef CONFIG_PPC64
 /* use 0x800000yy when locked, where yy == CPU number */
 #ifdef __BIG_ENDIAN__
@@ -59,6 +57,11 @@ static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock)
 	return lock.slock == 0;
 }
 
+static inline int arch_spin_is_locked(arch_spinlock_t *lock)
+{
+	return !arch_spin_value_unlocked(*lock);
+}
+
 /*
  * This returns the old value in the lock, so we succeeded
  * in getting the lock if the return value is 0.
-- 
cgit v1.2.3


From fd120dc2e205d2318a8b47d6d8098b789e3af67d Mon Sep 17 00:00:00 2001
From: Li Zhong <zhong@linux.vnet.ibm.com>
Date: Tue, 28 Jan 2014 17:52:42 +0530
Subject: powerpc/mm: Fix compile error of pgtable-ppc64.h

It seems that forward declaration couldn't work well with typedef, use
struct spinlock directly to avoiding following build errors:

In file included from include/linux/spinlock.h:81,
                 from include/linux/seqlock.h:35,
                 from include/linux/time.h:5,
                 from include/uapi/linux/timex.h:56,
                 from include/linux/timex.h:56,
                 from include/linux/sched.h:17,
                 from arch/powerpc/kernel/asm-offsets.c:17:
include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t'
/root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous declaration of 'spinlock_t' was here

Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index d27960c89a71..bc141c950b1e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -560,9 +560,9 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 			    pmd_t *pmdp);
 
 #define pmd_move_must_withdraw pmd_move_must_withdraw
-typedef struct spinlock spinlock_t;
-static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
-					 spinlock_t *old_pmd_ptl)
+struct spinlock;
+static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+					 struct spinlock *old_pmd_ptl)
 {
 	/*
 	 * Archs like ppc64 use pgtable to store per pmd
-- 
cgit v1.2.3


From 962e7bd4976516c34fc9ef51d536aab801980767 Mon Sep 17 00:00:00 2001
From: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Date: Tue, 14 Jan 2014 16:26:02 +0530
Subject: powerpc/pseries/cpuidle: Move processor_idle.c to drivers/cpuidle.

Move the file from arch specific pseries/processor_idle.c
to drivers/cpuidle/cpuidle-pseries.c
Make the relevant Makefile and Kconfig changes.
Also, introduce Kconfig.powerpc in drivers/cpuidle
for all powerpc cpuidle drivers.

Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/processor.h            |   2 +-
 arch/powerpc/platforms/pseries/Kconfig          |   9 -
 arch/powerpc/platforms/pseries/Makefile         |   1 -
 arch/powerpc/platforms/pseries/processor_idle.c | 308 --------------------
 drivers/cpuidle/Kconfig                         |   5 +
 drivers/cpuidle/Kconfig.powerpc                 |  11 +
 drivers/cpuidle/Makefile                        |   4 +
 drivers/cpuidle/cpuidle-pseries.c               | 361 ++++++++++++++++++++++++
 8 files changed, 382 insertions(+), 319 deletions(-)
 delete mode 100644 arch/powerpc/platforms/pseries/processor_idle.c
 create mode 100644 drivers/cpuidle/Kconfig.powerpc
 create mode 100644 drivers/cpuidle/cpuidle-pseries.c

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index 8ca20ac28dc2..c2c0f4478be3 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -451,7 +451,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 extern int powersave_nap;	/* set if nap mode can be used in idle loop */
 extern void power7_nap(void);
 
-#ifdef CONFIG_PSERIES_IDLE
+#ifdef CONFIG_PSERIES_CPUIDLE
 extern void update_smt_snooze_delay(int cpu, int residency);
 #else
 static inline void update_smt_snooze_delay(int cpu, int residency) {}
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index e66643250fee..37300f6ee244 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -119,12 +119,3 @@ config DTL
 	  which are accessible through a debugfs file.
 
 	  Say N if you are unsure.
-
-config PSERIES_IDLE
-	bool "Cpuidle driver for pSeries platforms"
-	depends on CPU_IDLE
-	depends on PPC_PSERIES
-	default y
-	help
-	  Select this option to enable processor idle state management
-	  through cpuidle subsystem.
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index fbccac9cd2dc..03480796af9a 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_HCALL_STATS)	+= hvCall_inst.o
 obj-$(CONFIG_CMM)		+= cmm.o
 obj-$(CONFIG_DTL)		+= dtl.o
 obj-$(CONFIG_IO_EVENT_IRQ)	+= io_event_irq.o
-obj-$(CONFIG_PSERIES_IDLE)	+= processor_idle.o
 obj-$(CONFIG_LPARCFG)		+= lparcfg.o
 
 ifeq ($(CONFIG_PPC_PSERIES),y)
diff --git a/arch/powerpc/platforms/pseries/processor_idle.c b/arch/powerpc/platforms/pseries/processor_idle.c
deleted file mode 100644
index 002d5b4112f2..000000000000
--- a/arch/powerpc/platforms/pseries/processor_idle.c
+++ /dev/null
@@ -1,308 +0,0 @@
-/*
- *  processor_idle - idle state cpuidle driver.
- *  Adapted from drivers/idle/intel_idle.c and
- *  drivers/acpi/processor_idle.c
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/moduleparam.h>
-#include <linux/cpuidle.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-
-#include <asm/paca.h>
-#include <asm/reg.h>
-#include <asm/machdep.h>
-#include <asm/firmware.h>
-#include <asm/plpar_wrappers.h>
-
-struct cpuidle_driver pseries_idle_driver = {
-	.name             = "pseries_idle",
-	.owner            = THIS_MODULE,
-};
-
-#define MAX_IDLE_STATE_COUNT	2
-
-static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
-static struct cpuidle_state *cpuidle_state_table;
-
-static inline void idle_loop_prolog(unsigned long *in_purr)
-{
-	*in_purr = mfspr(SPRN_PURR);
-	/*
-	 * Indicate to the HV that we are idle. Now would be
-	 * a good time to find other work to dispatch.
-	 */
-	get_lppaca()->idle = 1;
-}
-
-static inline void idle_loop_epilog(unsigned long in_purr)
-{
-	u64 wait_cycles;
-
-	wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
-	wait_cycles += mfspr(SPRN_PURR) - in_purr;
-	get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
-	get_lppaca()->idle = 0;
-}
-
-static int snooze_loop(struct cpuidle_device *dev,
-			struct cpuidle_driver *drv,
-			int index)
-{
-	unsigned long in_purr;
-	int cpu = dev->cpu;
-
-	idle_loop_prolog(&in_purr);
-	local_irq_enable();
-	set_thread_flag(TIF_POLLING_NRFLAG);
-
-	while ((!need_resched()) && cpu_online(cpu)) {
-		HMT_low();
-		HMT_very_low();
-	}
-
-	HMT_medium();
-	clear_thread_flag(TIF_POLLING_NRFLAG);
-	smp_mb();
-
-	idle_loop_epilog(in_purr);
-
-	return index;
-}
-
-static void check_and_cede_processor(void)
-{
-	/*
-	 * Ensure our interrupt state is properly tracked,
-	 * also checks if no interrupt has occurred while we
-	 * were soft-disabled
-	 */
-	if (prep_irq_for_idle()) {
-		cede_processor();
-#ifdef CONFIG_TRACE_IRQFLAGS
-		/* Ensure that H_CEDE returns with IRQs on */
-		if (WARN_ON(!(mfmsr() & MSR_EE)))
-			__hard_irq_enable();
-#endif
-	}
-}
-
-static int dedicated_cede_loop(struct cpuidle_device *dev,
-				struct cpuidle_driver *drv,
-				int index)
-{
-	unsigned long in_purr;
-
-	idle_loop_prolog(&in_purr);
-	get_lppaca()->donate_dedicated_cpu = 1;
-
-	HMT_medium();
-	check_and_cede_processor();
-
-	get_lppaca()->donate_dedicated_cpu = 0;
-
-	idle_loop_epilog(in_purr);
-
-	return index;
-}
-
-static int shared_cede_loop(struct cpuidle_device *dev,
-			struct cpuidle_driver *drv,
-			int index)
-{
-	unsigned long in_purr;
-
-	idle_loop_prolog(&in_purr);
-
-	/*
-	 * Yield the processor to the hypervisor.  We return if
-	 * an external interrupt occurs (which are driven prior
-	 * to returning here) or if a prod occurs from another
-	 * processor. When returning here, external interrupts
-	 * are enabled.
-	 */
-	check_and_cede_processor();
-
-	idle_loop_epilog(in_purr);
-
-	return index;
-}
-
-/*
- * States for dedicated partition case.
- */
-static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
-	{ /* Snooze */
-		.name = "snooze",
-		.desc = "snooze",
-		.flags = CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 0,
-		.target_residency = 0,
-		.enter = &snooze_loop },
-	{ /* CEDE */
-		.name = "CEDE",
-		.desc = "CEDE",
-		.flags = CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 10,
-		.target_residency = 100,
-		.enter = &dedicated_cede_loop },
-};
-
-/*
- * States for shared partition case.
- */
-static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
-	{ /* Shared Cede */
-		.name = "Shared Cede",
-		.desc = "Shared Cede",
-		.flags = CPUIDLE_FLAG_TIME_VALID,
-		.exit_latency = 0,
-		.target_residency = 0,
-		.enter = &shared_cede_loop },
-};
-
-void update_smt_snooze_delay(int cpu, int residency)
-{
-	struct cpuidle_driver *drv = cpuidle_get_driver();
-	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
-
-	if (cpuidle_state_table != dedicated_states)
-		return;
-
-	if (residency < 0) {
-		/* Disable the Nap state on that cpu */
-		if (dev)
-			dev->states_usage[1].disable = 1;
-	} else
-		if (drv)
-			drv->states[1].target_residency = residency;
-}
-
-static int pseries_cpuidle_add_cpu_notifier(struct notifier_block *n,
-			unsigned long action, void *hcpu)
-{
-	int hotcpu = (unsigned long)hcpu;
-	struct cpuidle_device *dev =
-			per_cpu_ptr(cpuidle_devices, hotcpu);
-
-	if (dev && cpuidle_get_driver()) {
-		switch (action) {
-		case CPU_ONLINE:
-		case CPU_ONLINE_FROZEN:
-			cpuidle_pause_and_lock();
-			cpuidle_enable_device(dev);
-			cpuidle_resume_and_unlock();
-			break;
-
-		case CPU_DEAD:
-		case CPU_DEAD_FROZEN:
-			cpuidle_pause_and_lock();
-			cpuidle_disable_device(dev);
-			cpuidle_resume_and_unlock();
-			break;
-
-		default:
-			return NOTIFY_DONE;
-		}
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block setup_hotplug_notifier = {
-	.notifier_call = pseries_cpuidle_add_cpu_notifier,
-};
-
-/*
- * pseries_cpuidle_driver_init()
- */
-static int pseries_cpuidle_driver_init(void)
-{
-	int idle_state;
-	struct cpuidle_driver *drv = &pseries_idle_driver;
-
-	drv->state_count = 0;
-
-	for (idle_state = 0; idle_state < MAX_IDLE_STATE_COUNT; ++idle_state) {
-
-		if (idle_state > max_idle_state)
-			break;
-
-		/* is the state not enabled? */
-		if (cpuidle_state_table[idle_state].enter == NULL)
-			continue;
-
-		drv->states[drv->state_count] =	/* structure copy */
-			cpuidle_state_table[idle_state];
-
-		drv->state_count += 1;
-	}
-
-	return 0;
-}
-
-/*
- * pseries_idle_probe()
- * Choose state table for shared versus dedicated partition
- */
-static int pseries_idle_probe(void)
-{
-
-	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
-		return -ENODEV;
-
-	if (cpuidle_disable != IDLE_NO_OVERRIDE)
-		return -ENODEV;
-
-	if (max_idle_state == 0) {
-		printk(KERN_DEBUG "pseries processor idle disabled.\n");
-		return -EPERM;
-	}
-
-	if (lppaca_shared_proc(get_lppaca()))
-		cpuidle_state_table = shared_states;
-	else
-		cpuidle_state_table = dedicated_states;
-
-	return 0;
-}
-
-static int __init pseries_processor_idle_init(void)
-{
-	int retval;
-
-	retval = pseries_idle_probe();
-	if (retval)
-		return retval;
-
-	pseries_cpuidle_driver_init();
-	retval = cpuidle_register(&pseries_idle_driver, NULL);
-	if (retval) {
-		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
-		return retval;
-	}
-
-	register_cpu_notifier(&setup_hotplug_notifier);
-	printk(KERN_DEBUG "pseries_idle_driver registered\n");
-
-	return 0;
-}
-
-static void __exit pseries_processor_idle_exit(void)
-{
-
-	unregister_cpu_notifier(&setup_hotplug_notifier);
-	cpuidle_unregister(&pseries_idle_driver);
-
-	return;
-}
-
-module_init(pseries_processor_idle_init);
-module_exit(pseries_processor_idle_exit);
-
-MODULE_AUTHOR("Deepthi Dharwar <deepthi@linux.vnet.ibm.com>");
-MODULE_DESCRIPTION("Cpuidle driver for POWER");
-MODULE_LICENSE("GPL");
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index b3fb81d7cf04..f04e25f6c98d 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -35,6 +35,11 @@ depends on ARM
 source "drivers/cpuidle/Kconfig.arm"
 endmenu
 
+menu "POWERPC CPU Idle Drivers"
+depends on PPC
+source "drivers/cpuidle/Kconfig.powerpc"
+endmenu
+
 endif
 
 config ARCH_NEEDS_CPU_IDLE_COUPLED
diff --git a/drivers/cpuidle/Kconfig.powerpc b/drivers/cpuidle/Kconfig.powerpc
new file mode 100644
index 000000000000..8147de522a96
--- /dev/null
+++ b/drivers/cpuidle/Kconfig.powerpc
@@ -0,0 +1,11 @@
+#
+# POWERPC CPU Idle Drivers
+#
+config PSERIES_CPUIDLE
+	bool "Cpuidle driver for pSeries platforms"
+	depends on CPU_IDLE
+	depends on PPC_PSERIES
+	default y
+	help
+	  Select this option to enable processor idle state management
+	  through cpuidle subsystem.
diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index 527be28e5c1e..a6331ad32738 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -13,3 +13,7 @@ obj-$(CONFIG_ARM_KIRKWOOD_CPUIDLE)	+= cpuidle-kirkwood.o
 obj-$(CONFIG_ARM_ZYNQ_CPUIDLE)		+= cpuidle-zynq.o
 obj-$(CONFIG_ARM_U8500_CPUIDLE)         += cpuidle-ux500.o
 obj-$(CONFIG_ARM_AT91_CPUIDLE)          += cpuidle-at91.o
+
+###############################################################################
+# POWERPC drivers
+obj-$(CONFIG_PSERIES_CPUIDLE)		+= cpuidle-pseries.o
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
new file mode 100644
index 000000000000..21154782402a
--- /dev/null
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -0,0 +1,361 @@
+/*
+ *  cpuidle-pseries - idle state cpuidle driver.
+ *  Adapted from drivers/idle/intel_idle.c and
+ *  drivers/acpi/processor_idle.c
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/cpuidle.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+
+#include <asm/paca.h>
+#include <asm/reg.h>
+#include <asm/machdep.h>
+#include <asm/firmware.h>
+#include <asm/plpar_wrappers.h>
+
+struct cpuidle_driver pseries_idle_driver = {
+	.name             = "pseries_idle",
+	.owner            = THIS_MODULE,
+};
+
+#define MAX_IDLE_STATE_COUNT	2
+
+static int max_idle_state = MAX_IDLE_STATE_COUNT - 1;
+static struct cpuidle_device __percpu *pseries_cpuidle_devices;
+static struct cpuidle_state *cpuidle_state_table;
+
+static inline void idle_loop_prolog(unsigned long *in_purr)
+{
+	*in_purr = mfspr(SPRN_PURR);
+	/*
+	 * Indicate to the HV that we are idle. Now would be
+	 * a good time to find other work to dispatch.
+	 */
+	get_lppaca()->idle = 1;
+}
+
+static inline void idle_loop_epilog(unsigned long in_purr)
+{
+	u64 wait_cycles;
+
+	wait_cycles = be64_to_cpu(get_lppaca()->wait_state_cycles);
+	wait_cycles += mfspr(SPRN_PURR) - in_purr;
+	get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
+	get_lppaca()->idle = 0;
+}
+
+static int snooze_loop(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
+			int index)
+{
+	unsigned long in_purr;
+	int cpu = dev->cpu;
+
+	idle_loop_prolog(&in_purr);
+	local_irq_enable();
+	set_thread_flag(TIF_POLLING_NRFLAG);
+
+	while ((!need_resched()) && cpu_online(cpu)) {
+		HMT_low();
+		HMT_very_low();
+	}
+
+	HMT_medium();
+	clear_thread_flag(TIF_POLLING_NRFLAG);
+	smp_mb();
+
+	idle_loop_epilog(in_purr);
+
+	return index;
+}
+
+static void check_and_cede_processor(void)
+{
+	/*
+	 * Ensure our interrupt state is properly tracked,
+	 * also checks if no interrupt has occurred while we
+	 * were soft-disabled
+	 */
+	if (prep_irq_for_idle()) {
+		cede_processor();
+#ifdef CONFIG_TRACE_IRQFLAGS
+		/* Ensure that H_CEDE returns with IRQs on */
+		if (WARN_ON(!(mfmsr() & MSR_EE)))
+			__hard_irq_enable();
+#endif
+	}
+}
+
+static int dedicated_cede_loop(struct cpuidle_device *dev,
+				struct cpuidle_driver *drv,
+				int index)
+{
+	unsigned long in_purr;
+
+	idle_loop_prolog(&in_purr);
+	get_lppaca()->donate_dedicated_cpu = 1;
+
+	HMT_medium();
+	check_and_cede_processor();
+
+	get_lppaca()->donate_dedicated_cpu = 0;
+
+	idle_loop_epilog(in_purr);
+
+	return index;
+}
+
+static int shared_cede_loop(struct cpuidle_device *dev,
+			struct cpuidle_driver *drv,
+			int index)
+{
+	unsigned long in_purr;
+
+	idle_loop_prolog(&in_purr);
+
+	/*
+	 * Yield the processor to the hypervisor.  We return if
+	 * an external interrupt occurs (which are driven prior
+	 * to returning here) or if a prod occurs from another
+	 * processor. When returning here, external interrupts
+	 * are enabled.
+	 */
+	check_and_cede_processor();
+
+	idle_loop_epilog(in_purr);
+
+	return index;
+}
+
+/*
+ * States for dedicated partition case.
+ */
+static struct cpuidle_state dedicated_states[MAX_IDLE_STATE_COUNT] = {
+	{ /* Snooze */
+		.name = "snooze",
+		.desc = "snooze",
+		.flags = CPUIDLE_FLAG_TIME_VALID,
+		.exit_latency = 0,
+		.target_residency = 0,
+		.enter = &snooze_loop },
+	{ /* CEDE */
+		.name = "CEDE",
+		.desc = "CEDE",
+		.flags = CPUIDLE_FLAG_TIME_VALID,
+		.exit_latency = 10,
+		.target_residency = 100,
+		.enter = &dedicated_cede_loop },
+};
+
+/*
+ * States for shared partition case.
+ */
+static struct cpuidle_state shared_states[MAX_IDLE_STATE_COUNT] = {
+	{ /* Shared Cede */
+		.name = "Shared Cede",
+		.desc = "Shared Cede",
+		.flags = CPUIDLE_FLAG_TIME_VALID,
+		.exit_latency = 0,
+		.target_residency = 0,
+		.enter = &shared_cede_loop },
+};
+
+void update_smt_snooze_delay(int cpu, int residency)
+{
+	struct cpuidle_driver *drv = cpuidle_get_driver();
+	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
+
+	if (cpuidle_state_table != dedicated_states)
+		return;
+
+	if (residency < 0) {
+		/* Disable the Nap state on that cpu */
+		if (dev)
+			dev->states_usage[1].disable = 1;
+	} else
+		if (drv)
+			drv->states[1].target_residency = residency;
+}
+
+static int pseries_cpuidle_add_cpu_notifier(struct notifier_block *n,
+			unsigned long action, void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct cpuidle_device *dev =
+			per_cpu_ptr(pseries_cpuidle_devices, hotcpu);
+
+	if (dev && cpuidle_get_driver()) {
+		switch (action) {
+		case CPU_ONLINE:
+		case CPU_ONLINE_FROZEN:
+			cpuidle_pause_and_lock();
+			cpuidle_enable_device(dev);
+			cpuidle_resume_and_unlock();
+			break;
+
+		case CPU_DEAD:
+		case CPU_DEAD_FROZEN:
+			cpuidle_pause_and_lock();
+			cpuidle_disable_device(dev);
+			cpuidle_resume_and_unlock();
+			break;
+
+		default:
+			return NOTIFY_DONE;
+		}
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block setup_hotplug_notifier = {
+	.notifier_call = pseries_cpuidle_add_cpu_notifier,
+};
+
+/*
+ * pseries_cpuidle_driver_init()
+ */
+static int pseries_cpuidle_driver_init(void)
+{
+	int idle_state;
+	struct cpuidle_driver *drv = &pseries_idle_driver;
+
+	drv->state_count = 0;
+
+	for (idle_state = 0; idle_state < MAX_IDLE_STATE_COUNT; ++idle_state) {
+
+		if (idle_state > max_idle_state)
+			break;
+
+		/* is the state not enabled? */
+		if (cpuidle_state_table[idle_state].enter == NULL)
+			continue;
+
+		drv->states[drv->state_count] =	/* structure copy */
+			cpuidle_state_table[idle_state];
+
+		drv->state_count += 1;
+	}
+
+	return 0;
+}
+
+/* pseries_idle_devices_uninit(void)
+ * unregister cpuidle devices and de-allocate memory
+ */
+static void pseries_idle_devices_uninit(void)
+{
+	int i;
+	struct cpuidle_device *dev;
+
+	for_each_possible_cpu(i) {
+		dev = per_cpu_ptr(pseries_cpuidle_devices, i);
+		cpuidle_unregister_device(dev);
+	}
+
+	free_percpu(pseries_cpuidle_devices);
+	return;
+}
+
+/* pseries_idle_devices_init()
+ * allocate, initialize and register cpuidle device
+ */
+static int pseries_idle_devices_init(void)
+{
+	int i;
+	struct cpuidle_driver *drv = &pseries_idle_driver;
+	struct cpuidle_device *dev;
+
+	pseries_cpuidle_devices = alloc_percpu(struct cpuidle_device);
+	if (pseries_cpuidle_devices == NULL)
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		dev = per_cpu_ptr(pseries_cpuidle_devices, i);
+		dev->state_count = drv->state_count;
+		dev->cpu = i;
+		if (cpuidle_register_device(dev)) {
+			printk(KERN_DEBUG \
+				"cpuidle_register_device %d failed!\n", i);
+			return -EIO;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * pseries_idle_probe()
+ * Choose state table for shared versus dedicated partition
+ */
+static int pseries_idle_probe(void)
+{
+
+	if (!firmware_has_feature(FW_FEATURE_SPLPAR))
+		return -ENODEV;
+
+	if (cpuidle_disable != IDLE_NO_OVERRIDE)
+		return -ENODEV;
+
+	if (max_idle_state == 0) {
+		printk(KERN_DEBUG "pseries processor idle disabled.\n");
+		return -EPERM;
+	}
+
+	if (lppaca_shared_proc(get_lppaca()))
+		cpuidle_state_table = shared_states;
+	else
+		cpuidle_state_table = dedicated_states;
+
+	return 0;
+}
+
+static int __init pseries_processor_idle_init(void)
+{
+	int retval;
+
+	retval = pseries_idle_probe();
+	if (retval)
+		return retval;
+
+	pseries_cpuidle_driver_init();
+	retval = cpuidle_register_driver(&pseries_idle_driver);
+	if (retval) {
+		printk(KERN_DEBUG "Registration of pseries driver failed.\n");
+		return retval;
+	}
+
+	retval = pseries_idle_devices_init();
+	if (retval) {
+		pseries_idle_devices_uninit();
+		cpuidle_unregister_driver(&pseries_idle_driver);
+		return retval;
+	}
+
+	register_cpu_notifier(&setup_hotplug_notifier);
+	printk(KERN_DEBUG "pseries_idle_driver registered\n");
+
+	return 0;
+}
+
+static void __exit pseries_processor_idle_exit(void)
+{
+
+	unregister_cpu_notifier(&setup_hotplug_notifier);
+	pseries_idle_devices_uninit();
+	cpuidle_unregister_driver(&pseries_idle_driver);
+
+	return;
+}
+
+module_init(pseries_processor_idle_init);
+module_exit(pseries_processor_idle_exit);
+
+MODULE_AUTHOR("Deepthi Dharwar <deepthi@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("Cpuidle driver for POWER");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 3fa8cad82b94d0bed002571bd246f2299ffc876b Mon Sep 17 00:00:00 2001
From: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Date: Tue, 14 Jan 2014 16:26:38 +0530
Subject: powerpc/pseries/cpuidle: smt-snooze-delay cleanup.

smt-snooze-delay was designed to disable NAP state or delay the entry
to the NAP state prior to adoption of cpuidle framework. This
is per-cpu variable. With the coming of CPUIDLE framework,
states can be disabled on per-cpu basis using the cpuidle/enable
sysfs entry.

Also, with the coming of cpuidle driver each state's target residency
is per-driver unlike earlier which was per-device. Therefore,
the per-cpu sysfs smt-snooze-delay which decides the target residency
of the idle state on a particular cpu causes more confusion to the user
as we cannot have different smt-snooze-delay (target residency)
values for each cpu.

In the current code, smt-snooze-delay functionality is completely broken.
It makes sense to remove smt-snooze-delay from idle driver with the
coming of cpuidle framework.
However, sysfs files are retained as ppc64_util currently
utilises it. Once we fix ppc64_util, propose to clean
up the kernel code.

Signed-off-by: Deepthi Dharwar <deepthi@linux.vnet.ibm.com>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/processor.h |  7 -------
 arch/powerpc/kernel/sysfs.c          |  2 --
 drivers/cpuidle/cpuidle-pseries.c    | 17 -----------------
 3 files changed, 26 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index c2c0f4478be3..b62de43ae5f3 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -450,13 +450,6 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
 
 extern int powersave_nap;	/* set if nap mode can be used in idle loop */
 extern void power7_nap(void);
-
-#ifdef CONFIG_PSERIES_CPUIDLE
-extern void update_smt_snooze_delay(int cpu, int residency);
-#else
-static inline void update_smt_snooze_delay(int cpu, int residency) {}
-#endif
-
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
 extern void poweroff_now(void);
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index d4a43e64a6a9..97e1dc917683 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -51,8 +51,6 @@ static ssize_t store_smt_snooze_delay(struct device *dev,
 		return -EINVAL;
 
 	per_cpu(smt_snooze_delay, cpu->dev.id) = snooze;
-	update_smt_snooze_delay(cpu->dev.id, snooze);
-
 	return count;
 }
 
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index bb56091685d3..7ab564aa0b1c 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -162,23 +162,6 @@ static struct cpuidle_state shared_states[] = {
 		.enter = &shared_cede_loop },
 };
 
-void update_smt_snooze_delay(int cpu, int residency)
-{
-	struct cpuidle_driver *drv = cpuidle_get_driver();
-	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
-
-	if (cpuidle_state_table != dedicated_states)
-		return;
-
-	if (residency < 0) {
-		/* Disable the Nap state on that cpu */
-		if (dev)
-			dev->states_usage[1].disable = 1;
-	} else
-		if (drv)
-			drv->states[1].target_residency = residency;
-}
-
 static int pseries_cpuidle_add_cpu_notifier(struct notifier_block *n,
 			unsigned long action, void *hcpu)
 {
-- 
cgit v1.2.3


From f878f84373aefda7f041a74b24a83b8b7dec1cf0 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 29 Jan 2014 17:13:05 +1100
Subject: powerpc: Wire up sched_setattr and sched_getattr syscalls

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/systbl.h      | 2 ++
 arch/powerpc/include/asm/unistd.h      | 2 +-
 arch/powerpc/include/uapi/asm/unistd.h | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc/include/asm')

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 43523fe0d8b4..3ddf70276706 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -359,3 +359,5 @@ COMPAT_SYS(process_vm_readv)
 COMPAT_SYS(process_vm_writev)
 SYSCALL(finit_module)
 SYSCALL(ni_syscall) /* sys_kcmp */
+SYSCALL_SPU(sched_setattr)
+SYSCALL_SPU(sched_getattr)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 3ca819f541bf..4494f029b632 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
 #include <uapi/asm/unistd.h>
 
 
-#define __NR_syscalls		355
+#define __NR_syscalls		357
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
diff --git a/arch/powerpc/include/uapi/asm/unistd.h b/arch/powerpc/include/uapi/asm/unistd.h
index 74cb4d72d673..881bf2e2560d 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -377,6 +377,7 @@
 #define __NR_process_vm_writev	352
 #define __NR_finit_module	353
 #define __NR_kcmp		354
-
+#define __NR_sched_setattr	355
+#define __NR_sched_getattr	356
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
-- 
cgit v1.2.3