From 73f10281ea96d7e8b4fc1c5d755a7c8eb484155b Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Wed, 14 May 2008 06:35:11 +0200
Subject: read_barrier_depends arch fixlets

read_barrier_depends has always been a noop (not even a compiler
barrier) on all architectures except SMP alpha. This brings UP alpha
and frv into line with all other architectures, and fixes incorrect
documentation.

Signed-off-by: Nick Piggin
Acked-by: Paul E. McKenney
Signed-off-by: Linus Torvalds
---
 include/asm-alpha/barrier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/asm-alpha')

diff --git a/include/asm-alpha/barrier.h b/include/asm-alpha/barrier.h
index 384dc08d6f53..ac78eba909bc 100644
--- a/include/asm-alpha/barrier.h
+++ b/include/asm-alpha/barrier.h
@@ -24,7 +24,7 @@ __asm__ __volatile__("mb": : :"memory")
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
-#define smp_read_barrier_depends()	barrier()
+#define smp_read_barrier_depends()	do { } while (0)
 #endif
 
 #define set_mb(var, value) \
--
cgit v1.2.3

From 362a61ad61199e19a61b8e432015e2586b288f5b Mon Sep 17 00:00:00 2001
From: Nick Piggin
Date: Wed, 14 May 2008 06:37:36 +0200
Subject: fix SMP data race in pagetable setup vs walking

There is a possible data race in the page table walking code. After the
split ptlock patches, it actually seems to have been introduced to the
core code, but even before that I think it would have impacted some
architectures (powerpc and sparc64, at least, walk the page tables
without taking locks, e.g. see find_linux_pte()).

The race is as follows: the pte page is allocated, zeroed, and its
struct page gets its spinlock initialized. The mm-wide ptl is then
taken, and then the pte page is inserted into the pagetables.

At this point, the spinlock is not guaranteed to have ordered the
previous stores to initialize the pte page with the subsequent store to
put it in the page tables. So another Linux page table walker might be
walking down (without any locks, because we have split-leaf-ptls), and
find that new pte we've inserted. It might try to take the spinlock
before the store from the other CPU initializes it. And subsequently it
might read a pte_t out before stores from the other CPU have cleared
the memory.

There are also similar races in higher levels of the page tables. They
obviously don't involve the spinlock, but could see uninitialized
memory.

Arch code and hardware pagetable walkers that walk the pagetables
without locks could see similar uninitialized memory problems,
regardless of whether split ptes are enabled or not.

I prefer to put the barriers in core code, because that's where the
higher level logic happens, but the page table accessors are per-arch,
and I don't think open-coding them everywhere is an option. I'll put
the read-side barriers in alpha arch code for now (other architectures
perform data-dependent loads in order).
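
To make the required ordering concrete, here is a minimal sketch of the
publish/consume pattern this patch enforces. It is illustrative only,
not part of the patch: the names pte_page_stub, publish_pte_page() and
walk_lockless() are invented, standing in for __pte_alloc() on the
write side and for lockless walkers such as find_linux_pte() on the
read side.

	/* Sketch only -- invented names, kernel-style primitives. */
	#include <linux/spinlock.h>
	#include <linux/string.h>

	struct pte_page_stub {
		spinlock_t ptl;			/* split page table lock */
		unsigned long entries[512];	/* zeroed before publish */
	};

	static struct pte_page_stub *shared_slot; /* stands in for a pmd entry */

	/* Write side, as in __pte_alloc(): initialize, then publish. */
	static void publish_pte_page(struct pte_page_stub *new)
	{
		spin_lock_init(&new->ptl);
		memset(new->entries, 0, sizeof(new->entries));
		smp_wmb();		/* init stores before the publish store */
		shared_slot = new;	/* insert into the page tables */
	}

	/* Read side, as in a lockless page table walker. */
	static void walk_lockless(void)
	{
		struct pte_page_stub *p = shared_slot;

		if (!p)
			return;
		smp_read_barrier_depends();	/* alpha: order dependent loads */
		spin_lock(&p->ptl);	/* now guaranteed to be initialized */
		/* loads of p->entries[] now see the zeroing above */
		spin_unlock(&p->ptl);
	}

Without the smp_wmb()/smp_read_barrier_depends() pair, a reader on
alpha may observe the newly published pointer but stale, uninitialized
contents behind it.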
Signed-off-by: Nick Piggin
Signed-off-by: Linus Torvalds
---
 include/asm-alpha/pgtable.h | 21 +++++++++++++++++++--
 mm/memory.c                 | 21 +++++++++++++++++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'include/asm-alpha')

diff --git a/include/asm-alpha/pgtable.h b/include/asm-alpha/pgtable.h
index 05ce5fba43e3..3f0c59f6d8aa 100644
--- a/include/asm-alpha/pgtable.h
+++ b/include/asm-alpha/pgtable.h
@@ -287,17 +287,34 @@ extern inline pte_t pte_mkspecial(pte_t pte)	{ return pte; }
 #define pgd_index(address)	(((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
 #define pgd_offset(mm, address)	((mm)->pgd+pgd_index(address))
 
+/*
+ * The smp_read_barrier_depends() in the following functions are required to
+ * order the load of *dir (the pointer in the top level page table) with any
+ * subsequent load of the returned pmd_t *ret (ret is data dependent on *dir).
+ *
+ * If this ordering is not enforced, the CPU might load an older value of
+ * *ret, which may be uninitialized data. See mm/memory.c:__pte_alloc for
+ * more details.
+ *
+ * Note that we never change the mm->pgd pointer after the task is running, so
+ * pgd_offset does not require such a barrier.
+ */
+
 /* Find an entry in the second-level page table.. */
 extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address)
 {
-	return (pmd_t *) pgd_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
+	pmd_t *ret = (pmd_t *) pgd_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
+	smp_read_barrier_depends(); /* see above */
+	return ret;
 }
 
 /* Find an entry in the third-level page table.. */
 extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address)
 {
-	return (pte_t *) pmd_page_vaddr(*dir)
+	pte_t *ret = (pte_t *) pmd_page_vaddr(*dir)
 		+ ((address >> PAGE_SHIFT) & (PTRS_PER_PAGE - 1));
+	smp_read_barrier_depends(); /* see above */
+	return ret;
 }
 
 #define pte_offset_map(dir,addr)	pte_offset_kernel((dir),(addr))
diff --git a/mm/memory.c b/mm/memory.c
index 48c122d42ed7..fb5608a120ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -311,6 +311,21 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	/*
+	 * Ensure all pte setup (eg. pte page lock and page clearing) are
+	 * visible before the pte is made visible to other CPUs by being
+	 * put into page tables.
+	 *
+	 * The other side of the story is the pointer chasing in the page
+	 * table walking code (when walking the page table without locking;
+	 * ie. most of the time). Fortunately, these data accesses consist
+	 * of a chain of data-dependent loads, meaning most CPUs (alpha
+	 * being the notable exception) will already guarantee loads are
+	 * seen in-order. See the alpha page table accessors for the
+	 * smp_read_barrier_depends() barriers in page table walking code.
+	 */
+	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+
 	spin_lock(&mm->page_table_lock);
 	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
 		mm->nr_ptes++;
@@ -329,6 +344,8 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	smp_wmb(); /* See comment in __pte_alloc */
+
 	spin_lock(&init_mm.page_table_lock);
 	if (!pmd_present(*pmd)) {	/* Has another populated it ? */
 		pmd_populate_kernel(&init_mm, pmd, new);
@@ -2619,6 +2636,8 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	smp_wmb(); /* See comment in __pte_alloc */
+
 	spin_lock(&mm->page_table_lock);
 	if (pgd_present(*pgd))		/* Another has populated it */
 		pud_free(mm, new);
@@ -2640,6 +2659,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	smp_wmb(); /* See comment in __pte_alloc */
+
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
 	if (pud_present(*pud))		/* Another has populated it */
--
cgit v1.2.3

From b7cffc1f29c1bc729bc50c863c87f93f9b70994b Mon Sep 17 00:00:00 2001
From: Mike Frysinger
Date: Wed, 14 May 2008 16:05:42 -0700
Subject: asm-{alpha,h8300,um,v850,xtensa}/param.h: unbreak HZ for userspace

I noticed this because alpha was broken by the recent commit
bdc807871d58285737d50dc6163d0feb72cb0dc2 ("avoid overflows in
kernel/time.c").

Most arches do something like this in their asm/param.h:

	#ifdef __KERNEL__
	# define HZ CONFIG_HZ
	#else
	# define HZ 100
	#endif

A few arches though (namely alpha/h8300/um/v850/xtensa) either do not
set HZ at all for !__KERNEL__, or they set it wrongly. This brings all
arches into line by setting up HZ for userspace.

Without this, perl 5.10 currently fails to build on alpha:

	perl.c: In function 'perl_construct':
	perl.c:388: error: 'CONFIG_HZ' undeclared (first use in this function)

-> http://buildd.debian.org/fetch.cgi?pkg=perl;ver=5.10.0-10;arch=alpha;stamp=1210252894

Signed-off-by: Mike Frysinger
Cc: Richard Henderson
Cc: Ivan Kokshaysky
Cc: Yoshinori Sato
Cc: Jeff Dike
Cc: Chris Zankel
Cc: maximilian attems
Signed-off-by: Andrew Morton
[ HZ on alpha is 1024 for historical reasons.  - Linus ]
Signed-off-by: Linus Torvalds
---
 include/asm-alpha/param.h  | 4 ++++
 include/asm-h8300/param.h  | 8 +++-----
 include/asm-um/param.h     | 2 ++
 include/asm-v850/param.h   | 2 ++
 include/asm-xtensa/param.h | 2 ++
 5 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/asm-alpha')

diff --git a/include/asm-alpha/param.h b/include/asm-alpha/param.h
index 0982f1d39499..e691ecfedb2c 100644
--- a/include/asm-alpha/param.h
+++ b/include/asm-alpha/param.h
@@ -5,8 +5,12 @@
    hardware ignores reprogramming.  We also need userland buy-in to the
    change in HZ, since this is visible in the wait4 resources etc.  */
 
+#ifdef __KERNEL__
 #define HZ		CONFIG_HZ
 #define USER_HZ		HZ
+#else
+#define HZ	1024
+#endif
 
 #define EXEC_PAGESIZE	8192
diff --git a/include/asm-h8300/param.h b/include/asm-h8300/param.h
index 04f64f100379..1c72fb8080ff 100644
--- a/include/asm-h8300/param.h
+++ b/include/asm-h8300/param.h
@@ -1,14 +1,12 @@
 #ifndef _H8300_PARAM_H
 #define _H8300_PARAM_H
-
-#ifndef HZ
-#define HZ CONFIG_HZ
-#endif
-
 #ifdef __KERNEL__
+#define HZ		CONFIG_HZ
 #define USER_HZ		HZ
 #define	CLOCKS_PER_SEC	(USER_HZ)
+#else
+#define HZ		100
 #endif
 
 #define EXEC_PAGESIZE	4096
diff --git a/include/asm-um/param.h b/include/asm-um/param.h
index 4cd4a226f8c1..e44f4e60d16d 100644
--- a/include/asm-um/param.h
+++ b/include/asm-um/param.h
@@ -13,6 +13,8 @@
 #define HZ CONFIG_HZ
 #define USER_HZ	100	   /* .. some user interfaces are in "ticks" */
 #define CLOCKS_PER_SEC (USER_HZ) /* frequency at which times() counts */
+#else
+#define HZ 100
 #endif
 
 #endif
diff --git a/include/asm-v850/param.h b/include/asm-v850/param.h
index 281832690290..4391f5fe0204 100644
--- a/include/asm-v850/param.h
+++ b/include/asm-v850/param.h
@@ -26,6 +26,8 @@
 # define HZ		CONFIG_HZ
 # define USER_HZ	100
 # define CLOCKS_PER_SEC	USER_HZ
+#else
+# define HZ		100
 #endif
 
 #endif /* __V850_PARAM_H__ */
diff --git a/include/asm-xtensa/param.h b/include/asm-xtensa/param.h
index 82ad34d92d35..ba03d5aeab6b 100644
--- a/include/asm-xtensa/param.h
+++ b/include/asm-xtensa/param.h
@@ -15,6 +15,8 @@
 # define HZ		CONFIG_HZ	/* internal timer frequency */
 # define USER_HZ	100		/* for user interfaces in "ticks" */
 # define CLOCKS_PER_SEC (USER_HZ)	/* frequnzy at which times() counts */
+#else
+# define HZ		100
 #endif
 
 #define EXEC_PAGESIZE	4096
--
cgit v1.2.3
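
As an aside on the failure mode fixed above: any userspace translation
unit that picks up HZ from the exported kernel headers hit this,
exactly as perl.c did. A minimal sketch of such a program follows; it
is a hypothetical test, not part of the patch series, and the
<asm/param.h> include is often reached indirectly via <sys/param.h>.

	/* ticks.c -- before this patch, compiling this on alpha failed
	 * with "'CONFIG_HZ' undeclared"; after it, HZ expands to 1024 on
	 * alpha and 100 on the other fixed arches. */
	#include <stdio.h>
	#include <asm/param.h>	/* exported kernel header defining HZ */

	int main(void)
	{
		/* older interfaces report time in "ticks" of 1/HZ seconds */
		printf("HZ = %d -> one tick = %.3f ms\n", HZ, 1000.0 / HZ);
		return 0;
	}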