From 9457afee85e0dfc2b5075a391d6f34463b4c2b90 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Thu, 5 Jun 2008 11:23:39 -0700 Subject: netlink: Remove nonblock parameter from netlink_attachskb Signed-off-by: Denis V. Lunev Signed-off-by: David S. Miller --- ipc/mqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ipc') diff --git a/ipc/mqueue.c b/ipc/mqueue.c index b3b69fd51330..3e84b958186b 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1054,7 +1054,7 @@ retry: } timeo = MAX_SCHEDULE_TIMEOUT; - ret = netlink_attachskb(sock, nc, 0, &timeo, NULL); + ret = netlink_attachskb(sock, nc, &timeo, NULL); if (ret == 1) goto retry; if (ret) { -- cgit v1.2.3 From a5516438959d90b071ff0a484ce4f3f523dc3152 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:41 -0700 Subject: hugetlb: modular state for hugetlb page size The goal of this patchset is to support multiple hugetlb page sizes. This is achieved by introducing a new struct hstate structure, which encapsulates the important hugetlb state and constants (eg. huge page size, number of huge pages currently allocated, etc). The hstate structure is then passed around the code which requires these fields, they will do the right thing regardless of the exact hstate they are operating on. This patch adds the hstate structure, with a single global instance of it (default_hstate), and does the basic work of converting hugetlb to use the hstate. Future patches will add more hstate structures to allow for different hugetlbfs mounts to have different page sizes. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/hugetlbpage.c | 7 +- arch/powerpc/mm/hugetlbpage.c | 3 +- arch/s390/mm/hugetlbpage.c | 3 +- arch/sh/mm/hugetlbpage.c | 3 +- arch/sparc64/mm/hugetlbpage.c | 5 +- arch/x86/mm/hugetlbpage.c | 5 +- fs/hugetlbfs/inode.c | 52 +++--- include/asm-ia64/hugetlb.h | 3 +- include/asm-powerpc/hugetlb.h | 3 +- include/asm-s390/hugetlb.h | 3 +- include/asm-sh/hugetlb.h | 3 +- include/asm-sparc/hugetlb.h | 3 +- include/asm-x86/hugetlb.h | 8 +- include/linux/hugetlb.h | 88 +++++++++- ipc/shm.c | 3 +- mm/hugetlb.c | 368 +++++++++++++++++++++++------------------- mm/memory.c | 2 +- mm/mempolicy.c | 9 +- mm/mmap.c | 3 +- 19 files changed, 356 insertions(+), 218 deletions(-) (limited to 'ipc') diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index cd49e2860eef..6170f097d255 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -24,7 +24,7 @@ unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT; pte_t * -huge_pte_alloc (struct mm_struct *mm, unsigned long addr) +huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; @@ -75,7 +75,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) * Don't actually need to do any preparation, but need to make sure * the address is in the right region. 
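The hstate idea described above is easiest to see in isolation. Below is a minimal, self-contained userspace sketch, not kernel code: the struct is trimmed to a few fields, the 1 GiB instance is purely hypothetical, and a 4 KiB base page is assumed. It shows the point of the refactoring: the helpers derive everything from the hstate's order, so the same code handles whichever huge page size it is handed.

#include <stdio.h>

#define PAGE_SHIFT 12UL                 /* assume 4 KiB base pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Trimmed-down model of the kernel's struct hstate. */
struct hstate {
	unsigned int order;             /* huge page size = PAGE_SIZE << order */
	unsigned long nr_huge_pages;
	unsigned long free_huge_pages;
};

static unsigned long huge_page_size(const struct hstate *h)
{
	return PAGE_SIZE << h->order;
}

static unsigned long pages_per_huge_page(const struct hstate *h)
{
	return 1UL << h->order;
}

/* Any code that is handed an hstate does the right thing for that size. */
static void report(const char *name, const struct hstate *h)
{
	printf("%s: %lu KiB per huge page, %lu base pages each\n",
	       name, huge_page_size(h) >> 10, pages_per_huge_page(h));
}

int main(void)
{
	struct hstate default_hstate = { .order = 9 };   /* 2 MiB */
	struct hstate gigantic       = { .order = 18 };  /* 1 GiB, hypothetical */

	report("default_hstate", &default_hstate);
	report("gigantic", &gigantic);
	return 0;
}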
*/ -int prepare_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; @@ -149,7 +150,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u /* Handle MAP_FIXED */ if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1a96cc891cf5..c94dc71af989 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -128,7 +128,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return NULL; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pg; pud_t *pu; diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index f4b6124fdb75..9162dc84f77f 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -72,7 +72,8 @@ void arch_release_hugepage(struct page *page) page[1].index = 0; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgdp; pud_t *pudp; diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index ae8c321d6e2a..2f9dbe0ef4ac 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -22,7 +22,8 @@ #include #include -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index ebefd2a14375..1307b23f6a76 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -175,7 +175,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } @@ -195,7 +195,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, pgoff, flags); } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 0b3d567e686d..52476fde8996 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 1; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; @@ -368,7 +369,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 428eff5b73f3..516c581b5371 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -80,6 +80,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_path.dentry->d_inode; loff_t len, vma_len; int ret; + struct hstate *h = hstate_file(file); /* * vma address alignment 
(but not the pgoff alignment) has @@ -92,7 +93,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; - if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT)) + if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; vma_len = (loff_t)(vma->vm_end - vma->vm_start); @@ -104,8 +105,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); if (hugetlb_reserve_pages(inode, - vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT), - len >> HPAGE_SHIFT, vma)) + vma->vm_pgoff >> huge_page_order(h), + len >> huge_page_shift(h), vma)) goto out; ret = 0; @@ -130,20 +131,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long start_addr; + struct hstate *h = hstate_file(file); - if (len & ~HPAGE_MASK) + if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } if (addr) { - addr = ALIGN(addr, HPAGE_SIZE); + addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) @@ -156,7 +158,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, start_addr = TASK_UNMAPPED_BASE; full_search: - addr = ALIGN(start_addr, HPAGE_SIZE); + addr = ALIGN(start_addr, huge_page_size(h)); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ @@ -174,7 +176,7 @@ full_search: if (!vma || addr + len <= vma->vm_start) return addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); + addr = ALIGN(vma->vm_end, huge_page_size(h)); } } #endif @@ -225,10 +227,11 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { + struct hstate *h = hstate_file(filp); struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - unsigned long index = *ppos >> HPAGE_SHIFT; - unsigned long offset = *ppos & ~HPAGE_MASK; + unsigned long index = *ppos >> huge_page_shift(h); + unsigned long offset = *ppos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; @@ -243,17 +246,17 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, if (!isize) goto out; - end_index = (isize - 1) >> HPAGE_SHIFT; + end_index = (isize - 1) >> huge_page_shift(h); for (;;) { struct page *page; - int nr, ret; + unsigned long nr, ret; /* nr is the maximum number of bytes to copy from this page */ - nr = HPAGE_SIZE; + nr = huge_page_size(h); if (index >= end_index) { if (index > end_index) goto out; - nr = ((isize - 1) & ~HPAGE_MASK) + 1; + nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) { goto out; } @@ -287,8 +290,8 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, offset += ret; retval += ret; len -= ret; - index += offset >> HPAGE_SHIFT; - offset &= ~HPAGE_MASK; + index += offset >> huge_page_shift(h); + offset &= ~huge_page_mask(h); if (page) page_cache_release(page); @@ -298,7 +301,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, break; } out: - *ppos = ((loff_t)index << HPAGE_SHIFT) + offset; + *ppos = ((loff_t)index << huge_page_shift(h)) + offset; mutex_unlock(&inode->i_mutex); 
return retval; } @@ -339,8 +342,9 @@ static void truncate_huge_page(struct page *page) static void truncate_hugepages(struct inode *inode, loff_t lstart) { + struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> HPAGE_SHIFT; + const pgoff_t start = lstart >> huge_page_shift(h); struct pagevec pvec; pgoff_t next; int i, freed = 0; @@ -449,8 +453,9 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; + struct hstate *h = hstate_inode(inode); - BUG_ON(offset & ~HPAGE_MASK); + BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); @@ -465,6 +470,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; + struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; @@ -476,7 +482,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE) { error = -EINVAL; - if (!(attr->ia_size & ~HPAGE_MASK)) + if (!(attr->ia_size & ~huge_page_mask(h))) error = hugetlb_vmtruncate(inode, attr->ia_size); if (error) goto out; @@ -610,9 +616,10 @@ static int hugetlbfs_set_page_dirty(struct page *page) static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); + struct hstate *h = hstate_inode(dentry->d_inode); buf->f_type = HUGETLBFS_MAGIC; - buf->f_bsize = HPAGE_SIZE; + buf->f_bsize = huge_page_size(h); if (sbinfo) { spin_lock(&sbinfo->stat_lock); /* If no limits set, just report 0 for max/free/used @@ -942,7 +949,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) goto out_dentry; error = -ENOMEM; - if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT, NULL)) + if (hugetlb_reserve_pages(inode, 0, + size >> huge_page_shift(hstate_inode(inode)), NULL)) goto out_inode; d_instantiate(dentry, inode); diff --git a/include/asm-ia64/hugetlb.h b/include/asm-ia64/hugetlb.h index e9d1e5e2382d..da55c63728e0 100644 --- a/include/asm-ia64/hugetlb.h +++ b/include/asm-ia64/hugetlb.h @@ -8,7 +8,8 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); -int prepare_hugepage_range(unsigned long addr, unsigned long len); +int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len); static inline int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, diff --git a/include/asm-powerpc/hugetlb.h b/include/asm-powerpc/hugetlb.h index 0a37aa5ecaa5..ca37c4af27b1 100644 --- a/include/asm-powerpc/hugetlb.h +++ b/include/asm-powerpc/hugetlb.h @@ -21,7 +21,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
*/ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/include/asm-s390/hugetlb.h b/include/asm-s390/hugetlb.h index 600a776f8f75..670a1d1745d2 100644 --- a/include/asm-s390/hugetlb.h +++ b/include/asm-s390/hugetlb.h @@ -22,7 +22,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/include/asm-sh/hugetlb.h b/include/asm-sh/hugetlb.h index fb30018938c7..967068fb79ac 100644 --- a/include/asm-sh/hugetlb.h +++ b/include/asm-sh/hugetlb.h @@ -14,7 +14,8 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/include/asm-sparc/hugetlb.h b/include/asm-sparc/hugetlb.h index aeb92374ca3d..177061064ee6 100644 --- a/include/asm-sparc/hugetlb.h +++ b/include/asm-sparc/hugetlb.h @@ -22,7 +22,8 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/include/asm-x86/hugetlb.h b/include/asm-x86/hugetlb.h index 7eed6e0883bf..439a9acc132d 100644 --- a/include/asm-x86/hugetlb.h +++ b/include/asm-x86/hugetlb.h @@ -14,11 +14,13 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
*/ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) +static inline int prepare_hugepage_range(struct file *file, + unsigned long addr, unsigned long len) { - if (len & ~HPAGE_MASK) + struct hstate *h = hstate_file(file); + if (len & ~huge_page_mask(h)) return -EINVAL; - if (addr & ~HPAGE_MASK) + if (addr & ~huge_page_mask(h)) return -EINVAL; return 0; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index abbc187193a1..ad2271e11f9b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -8,7 +8,6 @@ #include #include #include -#include struct ctl_table; @@ -45,7 +44,8 @@ extern int sysctl_hugetlb_shm_group; /* arch callbacks */ -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr); +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr); int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, @@ -80,7 +80,7 @@ static inline unsigned long hugetlb_total_pages(void) #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define prepare_hugepage_range(addr,len) (-EINVAL) +#define prepare_hugepage_range(file, addr, len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) @@ -134,8 +134,6 @@ struct file *hugetlb_file_setup(const char *name, size_t); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); -#define BLOCKS_PER_HUGEPAGE (HPAGE_SIZE / 512) - static inline int is_file_hugepages(struct file *file) { if (file->f_op == &hugetlbfs_file_operations) @@ -164,4 +162,84 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ +#ifdef CONFIG_HUGETLB_PAGE + +/* Defines one hugetlb page size */ +struct hstate { + int hugetlb_next_nid; + unsigned int order; + unsigned long mask; + unsigned long max_huge_pages; + unsigned long nr_huge_pages; + unsigned long free_huge_pages; + unsigned long resv_huge_pages; + unsigned long surplus_huge_pages; + unsigned long nr_overcommit_huge_pages; + struct list_head hugepage_freelists[MAX_NUMNODES]; + unsigned int nr_huge_pages_node[MAX_NUMNODES]; + unsigned int free_huge_pages_node[MAX_NUMNODES]; + unsigned int surplus_huge_pages_node[MAX_NUMNODES]; +}; + +extern struct hstate default_hstate; + +static inline struct hstate *hstate_vma(struct vm_area_struct *vma) +{ + return &default_hstate; +} + +static inline struct hstate *hstate_file(struct file *f) +{ + return &default_hstate; +} + +static inline struct hstate *hstate_inode(struct inode *i) +{ + return &default_hstate; +} + +static inline unsigned long huge_page_size(struct hstate *h) +{ + return (unsigned long)PAGE_SIZE << h->order; +} + +static inline unsigned long huge_page_mask(struct hstate *h) +{ + return h->mask; +} + +static inline unsigned int huge_page_order(struct hstate *h) +{ + return h->order; +} + +static inline unsigned huge_page_shift(struct hstate *h) +{ + return h->order + PAGE_SHIFT; +} + +static inline unsigned int pages_per_huge_page(struct hstate *h) +{ + return 1 << h->order; +} + +static inline unsigned int blocks_per_huge_page(struct hstate *h) +{ + return huge_page_size(h) 
/ 512; +} + +#include + +#else +struct hstate {}; +#define hstate_file(f) NULL +#define hstate_vma(v) NULL +#define hstate_inode(i) NULL +#define huge_page_size(h) PAGE_SIZE +#define huge_page_mask(h) PAGE_MASK +#define huge_page_order(h) 0 +#define huge_page_shift(h) PAGE_SHIFT +#define pages_per_huge_page(h) 1 +#endif + #endif /* _LINUX_HUGETLB_H */ diff --git a/ipc/shm.c b/ipc/shm.c index 790240cd067f..a726aebce7d7 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -577,7 +577,8 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; + struct hstate *h = hstate_file(shp->shm_file); + *rss += pages_per_huge_page(h) * mapping->nrpages; } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 32dff4290c66..0d8153e25f09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,18 +22,12 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; -static unsigned long surplus_huge_pages; -static unsigned long nr_overcommit_huge_pages; unsigned long max_huge_pages; unsigned long sysctl_overcommit_huge_pages; -static struct list_head hugepage_freelists[MAX_NUMNODES]; -static unsigned int nr_huge_pages_node[MAX_NUMNODES]; -static unsigned int free_huge_pages_node[MAX_NUMNODES]; -static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int hugetlb_next_nid; + +struct hstate default_hstate; /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -203,11 +197,11 @@ static long region_count(struct list_head *head, long f, long t) * Convert the address within this vma to the page offset within * the mapping, in pagecache page units; huge pages here. */ -static pgoff_t vma_hugecache_offset(struct vm_area_struct *vma, - unsigned long address) +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { - return ((address - vma->vm_start) >> HPAGE_SHIFT) + - (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); } /* @@ -309,20 +303,21 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) } /* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct vm_area_struct *vma) +static void decrement_hugepage_resv_vma(struct hstate *h, + struct vm_area_struct *vma) { if (vma->vm_flags & VM_NORESERVE) return; if (vma->vm_flags & VM_SHARED) { /* Shared mappings always use reserves */ - resv_huge_pages--; + h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { /* * Only the process that called mmap() has reserves for * private mappings. 
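The reworked vma_hugecache_offset() above replaces the fixed HPAGE_SHIFT arithmetic. A standalone userspace model of that calculation follows, with illustrative addresses and a 4 KiB base page assumed: the distance into the VMA is scaled down to huge-page units, and the VMA's base-page file offset is scaled down by the huge-page order before being added.

#include <stdio.h>

#define PAGE_SHIFT 12UL  /* 4 KiB base pages (assumption for the example) */

struct hstate_model { unsigned int order; };

static unsigned long huge_page_shift(const struct hstate_model *h)
{
	return PAGE_SHIFT + h->order;
}

/* Model of vma_hugecache_offset(): page-cache index, in huge-page units. */
static unsigned long hugecache_offset(const struct hstate_model *h,
				      unsigned long vm_start,
				      unsigned long vm_pgoff, /* base pages */
				      unsigned long address)
{
	return ((address - vm_start) >> huge_page_shift(h)) +
	       (vm_pgoff >> h->order);
}

int main(void)
{
	struct hstate_model h = { .order = 9 };  /* 2 MiB huge pages */
	unsigned long vm_start = 0x40000000UL;   /* illustrative mapping base */
	unsigned long vm_pgoff = 512;            /* file offset 2 MiB, i.e. huge page 1 */
	unsigned long addr = vm_start + 3 * (1UL << 21);

	/* Third huge page of the VMA + 1 huge page of file offset = index 4. */
	printf("index = %lu\n", hugecache_offset(&h, vm_start, vm_pgoff, addr));
	return 0;
}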
*/ - resv_huge_pages--; + h->resv_huge_pages--; } } @@ -344,12 +339,13 @@ static int vma_has_private_reserves(struct vm_area_struct *vma) return 1; } -static void clear_huge_page(struct page *page, unsigned long addr) +static void clear_huge_page(struct page *page, + unsigned long addr, unsigned long sz) { int i; might_sleep(); - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + for (i = 0; i < sz/PAGE_SIZE; i++) { cond_resched(); clear_user_highpage(page + i, addr + i * PAGE_SIZE); } @@ -359,41 +355,43 @@ static void copy_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; + struct hstate *h = hstate_vma(vma); might_sleep(); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } -static void enqueue_huge_page(struct page *page) +static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &hugepage_freelists[nid]); - free_huge_pages++; - free_huge_pages_node[nid]++; + list_add(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct hstate *h) { int nid; struct page *page = NULL; for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; break; } } return page; } -static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { int nid; @@ -411,26 +409,26 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, * not "stolen". 
The child may still get SIGKILLed */ if (!vma_has_private_reserves(vma) && - free_huge_pages - resv_huge_pages == 0) + h->free_huge_pages - h->resv_huge_pages == 0) return NULL; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && free_huge_pages - resv_huge_pages == 0) + if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) return NULL; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + !list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; if (!avoid_reserve) - decrement_hugepage_resv_vma(vma); + decrement_hugepage_resv_vma(h, vma); break; } @@ -439,12 +437,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, return page; } -static void update_and_free_page(struct page *page) +static void update_and_free_page(struct hstate *h, struct page *page) { int i; - nr_huge_pages--; - nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { + + h->nr_huge_pages--; + h->nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); @@ -452,11 +451,16 @@ static void update_and_free_page(struct page *page) set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); } static void free_huge_page(struct page *page) { + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. + */ + struct hstate *h = &default_hstate; int nid = page_to_nid(page); struct address_space *mapping; @@ -466,12 +470,12 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (surplus_huge_pages_node[nid]) { - update_and_free_page(page); - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + if (h->surplus_huge_pages_node[nid]) { + update_and_free_page(h, page); + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; } else { - enqueue_huge_page(page); + enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); if (mapping) @@ -483,7 +487,7 @@ static void free_huge_page(struct page *page) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. 
*/ -static int adjust_pool_surplus(int delta) +static int adjust_pool_surplus(struct hstate *h, int delta) { static int prev_nid; int nid = prev_nid; @@ -496,15 +500,15 @@ static int adjust_pool_surplus(int delta) nid = first_node(node_online_map); /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !surplus_huge_pages_node[nid]) + if (delta < 0 && !h->surplus_huge_pages_node[nid]) continue; /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && surplus_huge_pages_node[nid] >= - nr_huge_pages_node[nid]) + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) continue; - surplus_huge_pages += delta; - surplus_huge_pages_node[nid] += delta; + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; ret = 1; break; } while (nid != prev_nid); @@ -513,46 +517,46 @@ static int adjust_pool_surplus(int delta) return ret; } -static void prep_new_huge_page(struct page *page, int nid) +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[nid]++; + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); put_page(page); /* free it into the hugepage allocator */ } -static struct page *alloc_fresh_huge_page_node(int nid) +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { __free_pages(page, HUGETLB_PAGE_ORDER); return NULL; } - prep_new_huge_page(page, nid); + prep_new_huge_page(h, page, nid); } return page; } -static int alloc_fresh_huge_page(void) +static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = hugetlb_next_nid; + start_nid = h->hugetlb_next_nid; do { - page = alloc_fresh_huge_page_node(hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; /* @@ -566,11 +570,11 @@ static int alloc_fresh_huge_page(void) * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ - next_nid = next_node(hugetlb_next_nid, node_online_map); + next_nid = next_node(h->hugetlb_next_nid, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - hugetlb_next_nid = next_nid; - } while (!page && hugetlb_next_nid != start_nid); + h->hugetlb_next_nid = next_nid; + } while (!page && h->hugetlb_next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -580,8 +584,8 @@ static int alloc_fresh_huge_page(void) return ret; } -static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct page *page; unsigned int nid; @@ -610,18 +614,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * per-node value is checked there. 
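alloc_fresh_huge_page() above keeps a per-hstate cursor (hugetlb_next_nid) and walks the online nodes round-robin, always advancing the cursor and stopping once an allocation succeeds or the walk wraps back to its starting node. A small userspace model of that loop follows; the node count and the per-node "pool" are made up for the example.

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES 4

static int next_nid;                               /* models h->hugetlb_next_nid */
static int pages_left[NR_NODES] = { 0, 0, 2, 1 };  /* fake per-node pool */

static bool alloc_on_node(int nid)
{
	if (pages_left[nid] == 0)
		return false;
	pages_left[nid]--;
	return true;
}

/* Try every online node once, starting at the cursor, then advance it. */
static bool alloc_fresh_page(void)
{
	int start = next_nid;
	bool ok = false;

	do {
		ok = alloc_on_node(next_nid);
		/* Always advance, so the next caller starts on the next node. */
		next_nid = (next_nid + 1) % NR_NODES;
	} while (!ok && next_nid != start);

	return ok;
}

int main(void)
{
	for (int i = 0; i < 5; i++)
		printf("allocation %d: %s (cursor now %d)\n",
		       i, alloc_fresh_page() ? "ok" : "failed", next_nid);
	return 0;
}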
*/ spin_lock(&hugetlb_lock); - if (surplus_huge_pages >= nr_overcommit_huge_pages) { + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); return NULL; } else { - nr_huge_pages++; - surplus_huge_pages++; + h->nr_huge_pages++; + h->surplus_huge_pages++; } spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); spin_lock(&hugetlb_lock); if (page) { @@ -636,12 +640,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, /* * We incremented the global counters already */ - nr_huge_pages_node[nid]++; - surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[nid]++; + h->surplus_huge_pages_node[nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { - nr_huge_pages--; - surplus_huge_pages--; + h->nr_huge_pages--; + h->surplus_huge_pages--; __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -653,16 +657,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ -static int gather_surplus_pages(int delta) +static int gather_surplus_pages(struct hstate *h, int delta) { struct list_head surplus_list; struct page *page, *tmp; int ret, i; int needed, allocated; - needed = (resv_huge_pages + delta) - free_huge_pages; + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { - resv_huge_pages += delta; + h->resv_huge_pages += delta; return 0; } @@ -673,7 +677,7 @@ static int gather_surplus_pages(int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(NULL, 0); + page = alloc_buddy_huge_page(h, NULL, 0); if (!page) { /* * We were not able to allocate enough pages to @@ -694,7 +698,8 @@ retry: * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock(&hugetlb_lock); - needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); + needed = (h->resv_huge_pages + delta) - + (h->free_huge_pages + allocated); if (needed > 0) goto retry; @@ -707,7 +712,7 @@ retry: * before they are reserved. */ needed += allocated; - resv_huge_pages += delta; + h->resv_huge_pages += delta; ret = 0; free: /* Free the needed pages to the hugetlb pool */ @@ -715,7 +720,7 @@ free: if ((--needed) < 0) break; list_del(&page->lru); - enqueue_huge_page(page); + enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ @@ -743,7 +748,8 @@ free: * allocated to satisfy the reservation must be explicitly freed if they were * never used. 
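gather_surplus_pages() above only allocates when the free pool cannot already cover the existing reservations plus the new delta, and it recomputes that shortfall after re-taking the lock because the counters may have changed meanwhile. The core arithmetic, as a tiny standalone sketch with illustrative numbers:

#include <stdio.h>

/*
 * Extra pages that must be allocated so that (resv + delta) reservations
 * can all be honoured out of free pages.  Mirrors the calculation
 * needed = (resv_huge_pages + delta) - free_huge_pages in the patch.
 */
static long surplus_needed(long resv, long delta, long free)
{
	long needed = (resv + delta) - free;

	return needed > 0 ? needed : 0;
}

int main(void)
{
	/* 12 free pages, 10 already promised, 4 more requested -> short by 2. */
	printf("needed = %ld\n", surplus_needed(10, 4, 12));
	/* Plenty of free pages -> nothing to allocate, just bump resv. */
	printf("needed = %ld\n", surplus_needed(10, 4, 20));
	return 0;
}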
*/ -static void return_unused_surplus_pages(unsigned long unused_resv_pages) +static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) { static int nid = -1; struct page *page; @@ -758,27 +764,27 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) unsigned long remaining_iterations = num_online_nodes(); /* Uncommit the reservation */ - resv_huge_pages -= unused_resv_pages; + h->resv_huge_pages -= unused_resv_pages; - nr_pages = min(unused_resv_pages, surplus_huge_pages); + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { nid = next_node(nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); - if (!surplus_huge_pages_node[nid]) + if (!h->surplus_huge_pages_node[nid]) continue; - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[nid]--; - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; nr_pages--; remaining_iterations = num_online_nodes(); } @@ -794,13 +800,14 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. */ -static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) +static int vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -809,7 +816,7 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) } else { int err; - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); err = region_chg(&reservations->regions, idx, idx + 1); @@ -818,18 +825,18 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr) return 0; } } -static void vma_commit_reservation(struct vm_area_struct *vma, - unsigned long addr) +static void vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; if (vma->vm_flags & VM_SHARED) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(vma, addr); + pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); /* Mark this page used in the map. 
*/ @@ -840,6 +847,7 @@ static void vma_commit_reservation(struct vm_area_struct *vma, static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { + struct hstate *h = hstate_vma(vma); struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; @@ -852,7 +860,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, * MAP_NORESERVE mappings may also need pages and quota allocated * if no reserve mapping overlaps. */ - chg = vma_needs_reservation(vma, addr); + chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(chg); if (chg) @@ -860,11 +868,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr, avoid_reserve); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(vma, addr); + page = alloc_buddy_huge_page(h, vma, addr); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); @@ -874,7 +882,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); - vma_commit_reservation(vma, addr); + vma_commit_reservation(h, vma, addr); return page; } @@ -882,21 +890,28 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, static int __init hugetlb_init(void) { unsigned long i; + struct hstate *h = &default_hstate; if (HPAGE_SHIFT == 0) return 0; + if (!h->order) { + h->order = HPAGE_SHIFT - PAGE_SHIFT; + h->mask = HPAGE_MASK; + } + for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&hugepage_freelists[i]); + INIT_LIST_HEAD(&h->hugepage_freelists[i]); - hugetlb_next_nid = first_node(node_online_map); + h->hugetlb_next_nid = first_node(node_online_map); for (i = 0; i < max_huge_pages; ++i) { - if (!alloc_fresh_huge_page()) + if (!alloc_fresh_huge_page(h)) break; } - max_huge_pages = free_huge_pages = nr_huge_pages = i; - printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); + max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; + printk(KERN_INFO "Total HugeTLB memory allocated, %ld\n", + h->free_huge_pages); return 0; } module_init(hugetlb_init); @@ -922,34 +937,36 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM -static void try_to_free_low(unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count) { int i; for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; - list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { - if (count >= nr_huge_pages) + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) return; if (PageHighMem(page)) continue; list_del(&page->lru); update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[page_to_nid(page)]--; + h->free_huge_pages--; + h->free_huge_pages_node[page_to_nid(page)]--; } } } #else -static inline void try_to_free_low(unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count) { } #endif -#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(unsigned long count) { unsigned long min_count, ret; + struct hstate *h = &default_hstate; /* * Increase 
the pool size @@ -963,19 +980,19 @@ static unsigned long set_max_huge_pages(unsigned long count) * within all the constraints specified by the sysctls. */ spin_lock(&hugetlb_lock); - while (surplus_huge_pages && count > persistent_huge_pages) { - if (!adjust_pool_surplus(-1)) + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, -1)) break; } - while (count > persistent_huge_pages) { + while (count > persistent_huge_pages(h)) { /* * If this allocation races such that we no longer need the * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(); + ret = alloc_fresh_huge_page(h); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -997,21 +1014,21 @@ static unsigned long set_max_huge_pages(unsigned long count) * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. */ - min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; + min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(min_count); - while (min_count < persistent_huge_pages) { - struct page *page = dequeue_huge_page(); + try_to_free_low(h, min_count); + while (min_count < persistent_huge_pages(h)) { + struct page *page = dequeue_huge_page(h); if (!page) break; - update_and_free_page(page); + update_and_free_page(h, page); } - while (count < persistent_huge_pages) { - if (!adjust_pool_surplus(1)) + while (count < persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, 1)) break; } out: - ret = persistent_huge_pages; + ret = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); return ret; } @@ -1041,9 +1058,10 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + struct hstate *h = &default_hstate; proc_doulongvec_minmax(table, write, file, buffer, length, ppos); spin_lock(&hugetlb_lock); - nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; + h->nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; spin_unlock(&hugetlb_lock); return 0; } @@ -1052,37 +1070,40 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, int hugetlb_report_meminfo(char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "HugePages_Total: %5lu\n" "HugePages_Free: %5lu\n" "HugePages_Rsvd: %5lu\n" "HugePages_Surp: %5lu\n" "Hugepagesize: %5lu kB\n", - nr_huge_pages, - free_huge_pages, - resv_huge_pages, - surplus_huge_pages, - HPAGE_SIZE/1024); + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } int hugetlb_report_node_meminfo(int nid, char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", - nid, nr_huge_pages_node[nid], - nid, free_huge_pages_node[nid], - nid, surplus_huge_pages_node[nid]); + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. 
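In hugetlb_report_meminfo() above, the Hugepagesize line is now derived from the hstate rather than HPAGE_SIZE/1024; the expression 1UL << (huge_page_order(h) + PAGE_SHIFT - 10) is simply the huge page size expressed in KiB. A quick standalone check of that shift, assuming 4 KiB base pages:

#include <stdio.h>

#define PAGE_SHIFT 12UL  /* assumed 4 KiB base pages */

int main(void)
{
	unsigned int order = 9;  /* 2 MiB huge pages, for example */

	/* size-in-bytes >> 10  ==  1 << (order + PAGE_SHIFT - 10) */
	unsigned long kb = 1UL << (order + PAGE_SHIFT - 10);

	printf("Hugepagesize: %5lu kB\n", kb);  /* prints 2048 kB */
	return 0;
}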
*/ unsigned long hugetlb_total_pages(void) { - return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); + struct hstate *h = &default_hstate; + return h->nr_huge_pages * pages_per_huge_page(h); } -static int hugetlb_acct_memory(long delta) +static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; @@ -1105,18 +1126,18 @@ static int hugetlb_acct_memory(long delta) * semantics that cpuset has. */ if (delta > 0) { - if (gather_surplus_pages(delta) < 0) + if (gather_surplus_pages(h, delta) < 0) goto out; - if (delta > cpuset_mems_nr(free_huge_pages_node)) { - return_unused_surplus_pages(delta); + if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { + return_unused_surplus_pages(h, delta); goto out; } } ret = 0; if (delta < 0) - return_unused_surplus_pages((unsigned long) -delta); + return_unused_surplus_pages(h, (unsigned long) -delta); out: spin_unlock(&hugetlb_lock); @@ -1141,14 +1162,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) static void hugetlb_vm_op_close(struct vm_area_struct *vma) { + struct hstate *h = hstate_vma(vma); struct resv_map *reservations = vma_resv_map(vma); unsigned long reserve; unsigned long start; unsigned long end; if (reservations) { - start = vma_hugecache_offset(vma, vma->vm_start); - end = vma_hugecache_offset(vma, vma->vm_end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - region_count(&reservations->regions, start, end); @@ -1156,7 +1178,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); if (reserve) - hugetlb_acct_memory(-reserve); + hugetlb_acct_memory(h, -reserve); } } @@ -1214,14 +1236,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct page *ptepage; unsigned long addr; int cow; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr); + dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) goto nomem; @@ -1257,6 +1281,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, pte_t pte; struct page *page; struct page *tmp; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + /* * A page gathering list, protected by per file i_mmap_lock. 
The * lock is used to avoid list corruption from multiple unmapping @@ -1265,11 +1292,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, LIST_HEAD(page_list); WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON(start & ~HPAGE_MASK); - BUG_ON(end & ~HPAGE_MASK); + BUG_ON(start & ~huge_page_mask(h)); + BUG_ON(end & ~huge_page_mask(h)); spin_lock(&mm->page_table_lock); - for (address = start; address < end; address += HPAGE_SIZE) { + for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1383,6 +1410,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) { + struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int avoidcopy; int outside_reserve = 0; @@ -1443,7 +1471,7 @@ retry_avoidcopy: __SetPageUptodate(new_page); spin_lock(&mm->page_table_lock); - ptep = huge_pte_offset(mm, address & HPAGE_MASK); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); @@ -1458,14 +1486,14 @@ retry_avoidcopy: } /* Return the pagecache page at a given address within a VMA */ -static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *hugetlbfs_pagecache_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping; pgoff_t idx; mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(vma, address); + idx = vma_hugecache_offset(h, vma, address); return find_lock_page(mapping, idx); } @@ -1473,6 +1501,7 @@ static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma, static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { + struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; pgoff_t idx; unsigned long size; @@ -1493,7 +1522,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, } mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(vma, address); + idx = vma_hugecache_offset(h, vma, address); /* * Use page lock to guard against racing truncation @@ -1502,7 +1531,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, retry: page = find_lock_page(mapping, idx); if (!page) { - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; page = alloc_huge_page(vma, address, 0); @@ -1510,7 +1539,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address); + clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_SHARED) { @@ -1526,14 +1555,14 @@ retry: } spin_lock(&inode->i_lock); - inode->i_blocks += BLOCKS_PER_HUGEPAGE; + inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); } else lock_page(page); } spin_lock(&mm->page_table_lock); - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -1569,8 +1598,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret; static DEFINE_MUTEX(hugetlb_instantiation_mutex); + struct hstate *h = hstate_vma(vma); - ptep = huge_pte_alloc(mm, address); + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return 
VM_FAULT_OOM; @@ -1594,7 +1624,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (likely(pte_same(entry, huge_ptep_get(ptep)))) if (write_access && !pte_write(entry)) { struct page *page; - page = hugetlbfs_pagecache_page(vma, address); + page = hugetlbfs_pagecache_page(h, vma, address); ret = hugetlb_cow(mm, vma, address, ptep, entry, page); if (page) { unlock_page(page); @@ -1615,6 +1645,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; + struct hstate *h = hstate_vma(vma); spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { @@ -1626,7 +1657,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * each hugepage. We have to make * sure we get the * first, for the page indexing below to work. */ - pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); if (!pte || huge_pte_none(huge_ptep_get(pte)) || (write && !pte_write(huge_ptep_get(pte)))) { @@ -1644,7 +1675,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { @@ -1660,7 +1691,7 @@ same_page: --remainder; ++i; if (vaddr < vma->vm_end && remainder && - pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + pfn_offset < pages_per_huge_page(h)) { /* * We use pfn_offset to avoid touching the pageframes * of this compound page. @@ -1682,13 +1713,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long start = address; pte_t *ptep; pte_t pte; + struct hstate *h = hstate_vma(vma); BUG_ON(address >= end); flush_cache_range(vma, address, end); spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); spin_lock(&mm->page_table_lock); - for (; address < end; address += HPAGE_SIZE) { + for (; address < end; address += huge_page_size(h)) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1711,6 +1743,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct vm_area_struct *vma) { long ret, chg; + struct hstate *h = hstate_inode(inode); if (vma && vma->vm_flags & VM_NORESERVE) return 0; @@ -1739,7 +1772,7 @@ int hugetlb_reserve_pages(struct inode *inode, if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; - ret = hugetlb_acct_memory(chg); + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; @@ -1751,12 +1784,13 @@ int hugetlb_reserve_pages(struct inode *inode, void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { + struct hstate *h = hstate_inode(inode); long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; + inode->i_blocks -= blocks_per_huge_page(h); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); - hugetlb_acct_memory(-(chg - freed)); + hugetlb_acct_memory(h, -(chg - freed)); } diff --git a/mm/memory.c b/mm/memory.c index 72932489a082..c1c1d6d8c22b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -903,7 +903,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, if (unlikely(is_vm_hugetlb_page(vma))) { unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / - (HPAGE_SIZE / PAGE_SIZE); + pages_per_huge_page(hstate_vma(vma)); start = end; } else start = unmap_page_range(*tlbp, vma, 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c94e58b192c3..e550bec20582 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { zl = node_zonelist(interleave_nid(*mpol, vma, addr, - HPAGE_SHIFT), gfp_flags); + huge_page_shift(hstate_vma(vma))), gfp_flags); } else { zl = policy_zonelist(gfp_flags, *mpol); if ((*mpol)->mode == MPOL_BIND) @@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma, { unsigned long addr; struct page *page; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); - for (addr = start; addr < end; addr += HPAGE_SIZE) { - pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + for (addr = start; addr < end; addr += sz) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, + addr & huge_page_mask(h)); pte_t pte; if (!ptep) diff --git a/mm/mmap.c b/mm/mmap.c index 57d3b6097deb..5e0cc99e9cd5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1812,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, struct mempolicy *pol; struct vm_area_struct *new; - if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) + if (is_vm_hugetlb_page(vma) && (addr & + ~(huge_page_mask(hstate_vma(vma))))) return -EINVAL; if (mm->map_count >= sysctl_max_map_count) -- cgit v1.2.3 From 983bfb7db303cfde56ae5bbf4e0f2f46e38c9576 Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:48:03 -0700 Subject: ipc: call idr_find() without locking in ipc_lock() Call idr_find() locklessly from ipc_lock(), since the idr tree is now RCU protected. Signed-off-by: Nadia Derbey Acked-by: "Paul E. McKenney" Cc: Manfred Spraul Cc: Jim Houston Cc: Pierre Peiffer Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'ipc') diff --git a/ipc/util.c b/ipc/util.c index 3339177b336c..0f468c34e83c 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -688,10 +688,6 @@ void ipc64_perm_to_ipc_perm (struct ipc64_perm *in, struct ipc_perm *out) * Look for an id in the ipc ids idr and lock the associated ipc object. * * The ipc object is locked on exit. - * - * This is the routine that should be called when the rw_mutex is not already - * held, i.e. idr tree not protected: it protects the idr tree in read mode - * during the idr_find(). */ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id) @@ -699,18 +695,13 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id) struct kern_ipc_perm *out; int lid = ipcid_to_idx(id); - down_read(&ids->rw_mutex); - rcu_read_lock(); out = idr_find(&ids->ipcs_idr, lid); if (out == NULL) { rcu_read_unlock(); - up_read(&ids->rw_mutex); return ERR_PTR(-EINVAL); } - up_read(&ids->rw_mutex); - spin_lock(&out->lock); /* ipc_rmid() may have already freed the ID while ipc_lock -- cgit v1.2.3 From 00c2bf85d8febfcfddde63822043462b026134ff Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:48:03 -0700 Subject: ipc: get rid of ipc_lock_down() Remove the ipc_lock_down() routines: they used to call idr_find() locklessly (given that the ipc ids lock was already held), so they are not needed anymore. Signed-off-by: Nadia Derbey Acked-by: "Paul E. 
McKenney" Cc: Manfred Spraul Cc: Jim Houston Cc: Pierre Peiffer Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 21 +++------------------ ipc/util.c | 52 +--------------------------------------------------- ipc/util.h | 6 ------ 3 files changed, 4 insertions(+), 75 deletions(-) (limited to 'ipc') diff --git a/ipc/shm.c b/ipc/shm.c index a726aebce7d7..e77ec698cf40 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -111,24 +111,9 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } -/* - * shm_lock_(check_)down routines are called in the paths where the rw_mutex - * is held to protect access to the idr tree. - */ -static inline struct shmid_kernel *shm_lock_down(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_down(&shm_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; - - return container_of(ipcp, struct shmid_kernel, shm_perm); -} - /* * shm_lock_(check_) routines are called in the paths where the rw_mutex - * is not held. + * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) { @@ -211,7 +196,7 @@ static void shm_close(struct vm_area_struct *vma) down_write(&shm_ids(ns).rw_mutex); /* remove from the list of attaches of the shm segment */ - shp = shm_lock_down(ns, sfd->id); + shp = shm_lock(ns, sfd->id); BUG_ON(IS_ERR(shp)); shp->shm_lprid = task_tgid_vnr(current); shp->shm_dtim = get_seconds(); @@ -932,7 +917,7 @@ invalid: out_nattch: down_write(&shm_ids(ns).rw_mutex); - shp = shm_lock_down(ns, shmid); + shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; if(shp->shm_nattch == 0 && diff --git a/ipc/util.c b/ipc/util.c index 0f468c34e83c..49b3ea615dc5 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -716,56 +716,6 @@ struct kern_ipc_perm *ipc_lock(struct ipc_ids *ids, int id) return out; } -/** - * ipc_lock_down - Lock an ipc structure with rw_sem held - * @ids: IPC identifier set - * @id: ipc id to look for - * - * Look for an id in the ipc ids idr and lock the associated ipc object. - * - * The ipc object is locked on exit. - * - * This is the routine that should be called when the rw_mutex is already - * held, i.e. idr tree protected. - */ - -struct kern_ipc_perm *ipc_lock_down(struct ipc_ids *ids, int id) -{ - struct kern_ipc_perm *out; - int lid = ipcid_to_idx(id); - - rcu_read_lock(); - out = idr_find(&ids->ipcs_idr, lid); - if (out == NULL) { - rcu_read_unlock(); - return ERR_PTR(-EINVAL); - } - - spin_lock(&out->lock); - - /* - * No need to verify that the structure is still valid since the - * rw_mutex is held. 
- */ - return out; -} - -struct kern_ipc_perm *ipc_lock_check_down(struct ipc_ids *ids, int id) -{ - struct kern_ipc_perm *out; - - out = ipc_lock_down(ids, id); - if (IS_ERR(out)) - return out; - - if (ipc_checkid(out, id)) { - ipc_unlock(out); - return ERR_PTR(-EIDRM); - } - - return out; -} - struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) { struct kern_ipc_perm *out; @@ -837,7 +787,7 @@ struct kern_ipc_perm *ipcctl_pre_down(struct ipc_ids *ids, int id, int cmd, int err; down_write(&ids->rw_mutex); - ipcp = ipc_lock_check_down(ids, id); + ipcp = ipc_lock_check(ids, id); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); goto out_up; diff --git a/ipc/util.h b/ipc/util.h index cdb966aebe07..3646b45a03c9 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -102,11 +102,6 @@ void* ipc_rcu_alloc(int size); void ipc_rcu_getref(void *ptr); void ipc_rcu_putref(void *ptr); -/* - * ipc_lock_down: called with rw_mutex held - * ipc_lock: called without that lock held - */ -struct kern_ipc_perm *ipc_lock_down(struct ipc_ids *, int); struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int); void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out); @@ -155,7 +150,6 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) rcu_read_unlock(); } -struct kern_ipc_perm *ipc_lock_check_down(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); -- cgit v1.2.3 From 4daa28f6d8f5cda8ea0f55048e3c8811c384cbdd Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 25 Jul 2008 01:48:04 -0700 Subject: ipc/sem.c: convert undo structures to struct list_head The undo structures contain two linked lists, the attached patch replaces them with generic struct list_head lists. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Manfred Spraul Cc: Nadia Derbey Cc: Pierre Peiffer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 12 ++-- ipc/sem.c | 163 ++++++++++++++++++++++++++++------------------------ 2 files changed, 95 insertions(+), 80 deletions(-) (limited to 'ipc') diff --git a/include/linux/sem.h b/include/linux/sem.h index c8eaad9e4b72..6a1af1b49a13 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -95,7 +95,7 @@ struct sem_array { struct sem *sem_base; /* ptr to first semaphore in array */ struct sem_queue *sem_pending; /* pending operations to be processed */ struct sem_queue **sem_pending_last; /* last pending operation */ - struct sem_undo *undo; /* undo requests on this array */ + struct list_head list_id; /* undo requests on this array */ unsigned long sem_nsems; /* no. of semaphores in array */ }; @@ -118,8 +118,8 @@ struct sem_queue { * when the process exits. */ struct sem_undo { - struct sem_undo * proc_next; /* next entry on this process */ - struct sem_undo * id_next; /* next entry on this semaphore set */ + struct list_head list_proc; /* per-process list: all undos from one process */ + struct list_head list_id; /* per semaphore array list: all undos for one array */ int semid; /* semaphore set identifier */ short * semadj; /* array of adjustments, one per semaphore */ }; @@ -128,9 +128,9 @@ struct sem_undo { * that may be shared among all a CLONE_SYSVSEM task group. 
*/ struct sem_undo_list { - atomic_t refcnt; - spinlock_t lock; - struct sem_undo *proc_list; + atomic_t refcnt; + spinlock_t lock; + struct list_head list_proc; }; struct sysv_sem { diff --git a/ipc/sem.c b/ipc/sem.c index e9418df5ff3e..4f26c7157356 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -274,7 +274,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) sma->sem_base = (struct sem *) &sma[1]; /* sma->sem_pending = NULL; */ sma->sem_pending_last = &sma->sem_pending; - /* sma->undo = NULL; */ + INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = get_seconds(); sem_unlock(sma); @@ -536,7 +536,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) * (They will be freed without any further action in exit_sem() * or during the next semop.) */ - for (un = sma->undo; un; un = un->id_next) + assert_spin_locked(&sma->sem_perm.lock); + list_for_each_entry(un, &sma->list_id, list_id) un->semid = -1; /* Wake up all pending processes and let them fail with EIDRM. */ @@ -763,9 +764,12 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, for (i = 0; i < nsems; i++) sma->sem_base[i].semval = sem_io[i]; - for (un = sma->undo; un; un = un->id_next) + + assert_spin_locked(&sma->sem_perm.lock); + list_for_each_entry(un, &sma->list_id, list_id) { for (i = 0; i < nsems; i++) un->semadj[i] = 0; + } sma->sem_ctime = get_seconds(); /* maybe some queued-up processes were waiting for this */ update_queue(sma); @@ -797,12 +801,15 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, { int val = arg.val; struct sem_undo *un; + err = -ERANGE; if (val > SEMVMX || val < 0) goto out_unlock; - for (un = sma->undo; un; un = un->id_next) + assert_spin_locked(&sma->sem_perm.lock); + list_for_each_entry(un, &sma->list_id, list_id) un->semadj[semnum] = 0; + curr->semval = val; curr->sempid = task_tgid_vnr(current); sma->sem_ctime = get_seconds(); @@ -952,6 +959,8 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) return -ENOMEM; spin_lock_init(&undo_list->lock); atomic_set(&undo_list->refcnt, 1); + INIT_LIST_HEAD(&undo_list->list_proc); + current->sysvsem.undo_list = undo_list; } *undo_listp = undo_list; @@ -960,25 +969,30 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) { - struct sem_undo **last, *un; - - last = &ulp->proc_list; - un = *last; - while(un != NULL) { - if(un->semid==semid) - break; - if(un->semid==-1) { - *last=un->proc_next; - kfree(un); - } else { - last=&un->proc_next; + struct sem_undo *walk, *tmp; + + assert_spin_locked(&ulp->lock); + list_for_each_entry_safe(walk, tmp, &ulp->list_proc, list_proc) { + if (walk->semid == semid) + return walk; + if (walk->semid == -1) { + list_del(&walk->list_proc); + kfree(walk); } - un=*last; } - return un; + return NULL; } -static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) +/** + * find_alloc_undo - Lookup (and if not present create) undo array + * @ns: namespace + * @semid: semaphore array id + * + * The function looks up (and if not present creates) the undo structure. + * The size of the undo structure depends on the size of the semaphore + * array, thus the alloc path is not that straightforward. 
+ */ +static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) { struct sem_array *sma; struct sem_undo_list *ulp; @@ -997,6 +1011,7 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) goto out; /* no undo structure around - allocate one. */ + /* step 1: figure out the size of the semaphore array */ sma = sem_lock_check(ns, semid); if (IS_ERR(sma)) return ERR_PTR(PTR_ERR(sma)); @@ -1004,15 +1019,19 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) nsems = sma->sem_nsems; sem_getref_and_unlock(sma); + /* step 2: allocate new undo structure */ new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL); if (!new) { sem_putref(sma); return ERR_PTR(-ENOMEM); } - new->semadj = (short *) &new[1]; - new->semid = semid; + /* step 3: Acquire the lock on the undo list pointer */ spin_lock(&ulp->lock); + + /* step 4: check for races: someone else allocated the undo struct, + * semaphore array was destroyed. + */ un = lookup_undo(ulp, semid); if (un) { spin_unlock(&ulp->lock); @@ -1028,13 +1047,17 @@ static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) un = ERR_PTR(-EIDRM); goto out; } - new->proc_next = ulp->proc_list; - ulp->proc_list = new; - new->id_next = sma->undo; - sma->undo = new; + /* step 5: initialize & link new undo structure */ + new->semadj = (short *) &new[1]; + new->semid = semid; + assert_spin_locked(&ulp->lock); + list_add(&new->list_proc, &ulp->list_proc); + assert_spin_locked(&sma->sem_perm.lock); + list_add(&new->list_id, &sma->list_id); + sem_unlock(sma); - un = new; spin_unlock(&ulp->lock); + un = new; out: return un; } @@ -1090,9 +1113,8 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, alter = 1; } -retry_undos: if (undos) { - un = find_undo(ns, semid); + un = find_alloc_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); goto out_free; @@ -1107,14 +1129,14 @@ retry_undos: } /* - * semid identifiers are not unique - find_undo may have + * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID - * and now a new array with received the same id. Check and retry. + * and now a new array with received the same id. Check and fail. */ - if (un && un->semid == -1) { - sem_unlock(sma); - goto retry_undos; - } + error = -EIDRM; + if (un && un->semid == -1) + goto out_unlock_free; + error = -EFBIG; if (max >= sma->sem_nsems) goto out_unlock_free; @@ -1243,56 +1265,44 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) */ void exit_sem(struct task_struct *tsk) { - struct sem_undo_list *undo_list; - struct sem_undo *u, **up; - struct ipc_namespace *ns; + struct sem_undo_list *ulp; + struct sem_undo *un, *tmp; - undo_list = tsk->sysvsem.undo_list; - if (!undo_list) + ulp = tsk->sysvsem.undo_list; + if (!ulp) return; tsk->sysvsem.undo_list = NULL; - if (!atomic_dec_and_test(&undo_list->refcnt)) + if (!atomic_dec_and_test(&ulp->refcnt)) return; - ns = tsk->nsproxy->ipc_ns; - /* There's no need to hold the semundo list lock, as current - * is the last task exiting for this undo list. 
- */ - for (up = &undo_list->proc_list; (u = *up); *up = u->proc_next, kfree(u)) { + spin_lock(&ulp->lock); + + list_for_each_entry_safe(un, tmp, &ulp->list_proc, list_proc) { struct sem_array *sma; - int nsems, i; - struct sem_undo *un, **unp; - int semid; - - semid = u->semid; - - if(semid == -1) - continue; - sma = sem_lock(ns, semid); + int i; + + if (un->semid == -1) + goto free; + + sma = sem_lock(tsk->nsproxy->ipc_ns, un->semid); if (IS_ERR(sma)) - continue; + goto free; - if (u->semid == -1) - goto next_entry; + if (un->semid == -1) + goto unlock_free; - BUG_ON(sem_checkid(sma, u->semid)); + BUG_ON(sem_checkid(sma, un->semid)); - /* remove u from the sma->undo list */ - for (unp = &sma->undo; (un = *unp); unp = &un->id_next) { - if (u == un) - goto found; - } - printk ("exit_sem undo list error id=%d\n", u->semid); - goto next_entry; -found: - *unp = un->id_next; - /* perform adjustments registered in u */ - nsems = sma->sem_nsems; - for (i = 0; i < nsems; i++) { + /* remove un from sma->list_id */ + assert_spin_locked(&sma->sem_perm.lock); + list_del(&un->list_id); + + /* perform adjustments registered in un */ + for (i = 0; i < sma->sem_nsems; i++) { struct sem * semaphore = &sma->sem_base[i]; - if (u->semadj[i]) { - semaphore->semval += u->semadj[i]; + if (un->semadj[i]) { + semaphore->semval += un->semadj[i]; /* * Range checks of the new semaphore value, * not defined by sus: @@ -1316,10 +1326,15 @@ found: sma->sem_otime = get_seconds(); /* maybe some queued-up processes were waiting for this */ update_queue(sma); -next_entry: +unlock_free: sem_unlock(sma); +free: + assert_spin_locked(&ulp->lock); + list_del(&un->list_proc); + kfree(un); } - kfree(undo_list); + spin_unlock(&ulp->lock); + kfree(ulp); } #ifdef CONFIG_PROC_FS -- cgit v1.2.3 From 2c0c29d414087f3b021059673c20a7088f5f1fff Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 25 Jul 2008 01:48:05 -0700 Subject: ipc/sem.c: remove unused entries from struct sem_queue sem_queue.sma and sem_queue.id were never used, the attached patch removes them. Signed-off-by: Manfred Spraul Reviewed-by: Nadia Derbey Cc: Pierre Peiffer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 2 -- ipc/sem.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'ipc') diff --git a/include/linux/sem.h b/include/linux/sem.h index 6a1af1b49a13..87756ef1198e 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -107,8 +107,6 @@ struct sem_queue { struct sem_undo * undo; /* undo structure */ int pid; /* process id of requesting process */ int status; /* completion status of operation */ - struct sem_array * sma; /* semaphore array for operations */ - int id; /* internal sem id */ struct sembuf * sops; /* array of pending operations */ int nsops; /* number of operations */ int alter; /* does the operation alter the array? */ diff --git a/ipc/sem.c b/ipc/sem.c index 4f26c7157356..d5ce4000ca17 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1160,12 +1160,10 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, * task into the pending queue and go to sleep. 
*/ - queue.sma = sma; queue.sops = sops; queue.nsops = nsops; queue.undo = un; queue.pid = task_tgid_vnr(current); - queue.id = semid; queue.alter = alter; if (alter) append_to_queue(sma ,&queue); -- cgit v1.2.3 From a1193f8ec091cd8fd309cc2982abe4499f6f2b4d Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 25 Jul 2008 01:48:06 -0700 Subject: ipc/sem.c: convert sem_array.sem_pending to struct list_head sem_array.sem_pending is a double linked list, the attached patch converts it to struct list_head. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Manfred Spraul Reviewed-by: Nadia Derbey Cc: Pierre Peiffer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 12 +++---- ipc/sem.c | 92 ++++++++++++++++++++--------------------------------- 2 files changed, 40 insertions(+), 64 deletions(-) (limited to 'ipc') diff --git a/include/linux/sem.h b/include/linux/sem.h index 87756ef1198e..d42599395d79 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -93,21 +93,19 @@ struct sem_array { time_t sem_otime; /* last semop time */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ - struct sem_queue *sem_pending; /* pending operations to be processed */ - struct sem_queue **sem_pending_last; /* last pending operation */ + struct list_head sem_pending; /* pending operations to be processed */ struct list_head list_id; /* undo requests on this array */ unsigned long sem_nsems; /* no. of semaphores in array */ }; /* One queue for each sleeping process in the system. */ struct sem_queue { - struct sem_queue * next; /* next entry in the queue */ - struct sem_queue ** prev; /* previous entry in the queue, *(q->prev) == q */ - struct task_struct* sleeper; /* this process */ - struct sem_undo * undo; /* undo structure */ + struct list_head list; /* queue of pending operations */ + struct task_struct *sleeper; /* this process */ + struct sem_undo *undo; /* undo structure */ int pid; /* process id of requesting process */ int status; /* completion status of operation */ - struct sembuf * sops; /* array of pending operations */ + struct sembuf *sops; /* array of pending operations */ int nsops; /* number of operations */ int alter; /* does the operation alter the array? */ }; diff --git a/ipc/sem.c b/ipc/sem.c index d5ce4000ca17..3ca232736b31 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -272,8 +272,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) ns->used_sems += nsems; sma->sem_base = (struct sem *) &sma[1]; - /* sma->sem_pending = NULL; */ - sma->sem_pending_last = &sma->sem_pending; + INIT_LIST_HEAD(&sma->sem_pending); INIT_LIST_HEAD(&sma->list_id); sma->sem_nsems = nsems; sma->sem_ctime = get_seconds(); @@ -331,38 +330,6 @@ asmlinkage long sys_semget(key_t key, int nsems, int semflg) return ipcget(ns, &sem_ids(ns), &sem_ops, &sem_params); } -/* Manage the doubly linked list sma->sem_pending as a FIFO: - * insert new queue elements at the tail sma->sem_pending_last. 
- */ -static inline void append_to_queue (struct sem_array * sma, - struct sem_queue * q) -{ - *(q->prev = sma->sem_pending_last) = q; - *(sma->sem_pending_last = &q->next) = NULL; -} - -static inline void prepend_to_queue (struct sem_array * sma, - struct sem_queue * q) -{ - q->next = sma->sem_pending; - *(q->prev = &sma->sem_pending) = q; - if (q->next) - q->next->prev = &q->next; - else /* sma->sem_pending_last == &sma->sem_pending */ - sma->sem_pending_last = &q->next; -} - -static inline void remove_from_queue (struct sem_array * sma, - struct sem_queue * q) -{ - *(q->prev) = q->next; - if (q->next) - q->next->prev = q->prev; - else /* sma->sem_pending_last == &q->next */ - sma->sem_pending_last = q->prev; - q->prev = NULL; /* mark as removed */ -} - /* * Determine whether a sequence of semaphore operations would succeed * all at once. Return 0 if yes, 1 if need to sleep, else return error code. @@ -438,16 +405,15 @@ static void update_queue (struct sem_array * sma) int error; struct sem_queue * q; - q = sma->sem_pending; - while(q) { + q = list_entry(sma->sem_pending.next, struct sem_queue, list); + while (&q->list != &sma->sem_pending) { error = try_atomic_semop(sma, q->sops, q->nsops, q->undo, q->pid); /* Does q->sleeper still need to sleep? */ if (error <= 0) { struct sem_queue *n; - remove_from_queue(sma,q); - q->status = IN_WAKEUP; + /* * Continue scanning. The next operation * that must be checked depends on the type of the @@ -458,11 +424,26 @@ static void update_queue (struct sem_array * sma) * for semaphore values to become 0. * - if the operation didn't modify the array, * then just continue. + * The order of list_del() and reading ->next + * is crucial: In the former case, the list_del() + * must be done first [because we might be the + * first entry in ->sem_pending], in the latter + * case the list_del() must be done last + * [because the list is invalid after the list_del()] */ - if (q->alter) - n = sma->sem_pending; - else - n = q->next; + if (q->alter) { + list_del(&q->list); + n = list_entry(sma->sem_pending.next, + struct sem_queue, list); + } else { + n = list_entry(q->list.next, struct sem_queue, + list); + list_del(&q->list); + } + + /* wake up the waiting thread */ + q->status = IN_WAKEUP; + wake_up_process(q->sleeper); /* hands-off: q will disappear immediately after * writing q->status. 
@@ -471,7 +452,7 @@ static void update_queue (struct sem_array * sma) q->status = error; q = n; } else { - q = q->next; + q = list_entry(q->list.next, struct sem_queue, list); } } } @@ -491,7 +472,7 @@ static int count_semncnt (struct sem_array * sma, ushort semnum) struct sem_queue * q; semncnt = 0; - for (q = sma->sem_pending; q; q = q->next) { + list_for_each_entry(q, &sma->sem_pending, list) { struct sembuf * sops = q->sops; int nsops = q->nsops; int i; @@ -503,13 +484,14 @@ static int count_semncnt (struct sem_array * sma, ushort semnum) } return semncnt; } + static int count_semzcnt (struct sem_array * sma, ushort semnum) { int semzcnt; struct sem_queue * q; semzcnt = 0; - for (q = sma->sem_pending; q; q = q->next) { + list_for_each_entry(q, &sma->sem_pending, list) { struct sembuf * sops = q->sops; int nsops = q->nsops; int i; @@ -529,7 +511,7 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct sem_undo *un; - struct sem_queue *q; + struct sem_queue *q, *t; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); /* Invalidate the existing undo structures for this semaphore set. @@ -541,17 +523,14 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) un->semid = -1; /* Wake up all pending processes and let them fail with EIDRM. */ - q = sma->sem_pending; - while(q) { - struct sem_queue *n; - /* lazy remove_from_queue: we are killing the whole queue */ - q->prev = NULL; - n = q->next; + + list_for_each_entry_safe(q, t, &sma->sem_pending, list) { + list_del(&q->list); + q->status = IN_WAKEUP; wake_up_process(q->sleeper); /* doesn't sleep */ smp_wmb(); q->status = -EIDRM; /* hands-off q */ - q = n; } /* Remove the semaphore set from the IDR */ @@ -1166,9 +1145,9 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, queue.pid = task_tgid_vnr(current); queue.alter = alter; if (alter) - append_to_queue(sma ,&queue); + list_add_tail(&queue.list, &sma->sem_pending); else - prepend_to_queue(sma ,&queue); + list_add(&queue.list, &sma->sem_pending); queue.status = -EINTR; queue.sleeper = current; @@ -1194,7 +1173,6 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, sma = sem_lock(ns, semid); if (IS_ERR(sma)) { - BUG_ON(queue.prev != NULL); error = -EIDRM; goto out_free; } @@ -1212,7 +1190,7 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, */ if (timeout && jiffies_left == 0) error = -EAGAIN; - remove_from_queue(sma,&queue); + list_del(&queue.list); goto out_unlock_free; out_unlock_free: -- cgit v1.2.3 From 380af1b33b3ff92df5cda96329b58f5d1b6b5a53 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Fri, 25 Jul 2008 01:48:06 -0700 Subject: ipc/sem.c: rewrite undo list locking The attached patch: - reverses the locking order of ulp->lock and sem_lock: Previously, it was first ulp->lock, then inside sem_lock. Now it's the other way around. - converts the undo structure to rcu. Benefits: - With the old locking order, IPC_RMID could not kfree the undo structures. The stale entries remained in the linked lists and were released later. - The patch fixes a a race in semtimedop(): if both IPC_RMID and a semget() that recreates exactly the same id happen between find_alloc_undo() and sem_lock, then semtimedop() would access already kfree'd memory. 
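For illustration only, here is a minimal kernel-style sketch of the RCU-protected list lifecycle this patch adopts for the undo entries: readers walk the list inside an RCU read-side critical section, while the updater unlinks with list_del_rcu() under a spinlock and defers the kfree() through call_rcu(). The struct and function names below are hypothetical; the real patch hangs the list off sem_undo_list and frees through free_un(), as the diff further down shows.

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Sketch only: an element that may be freed while readers still walk the list. */
struct undo_like {
        struct list_head node;  /* linked into an RCU-protected list */
        struct rcu_head rcu;    /* deferred-free handle for call_rcu() */
        int semid;
};

static void undo_like_free(struct rcu_head *head)
{
        kfree(container_of(head, struct undo_like, rcu));
}

/* Reader side: caller holds rcu_read_lock(); no spinlock needed. */
static struct undo_like *undo_like_lookup(struct list_head *head, int semid)
{
        struct undo_like *u;

        list_for_each_entry_rcu(u, head, node)
                if (u->semid == semid)
                        return u;       /* valid while rcu_read_lock() is held */
        return NULL;
}

/* Updater side: unlink under the list's lock, free after a grace period. */
static void undo_like_remove(spinlock_t *lock, struct undo_like *u)
{
        spin_lock(lock);
        u->semid = -1;                  /* mark stale for racing lookups */
        list_del_rcu(&u->node);
        spin_unlock(lock);
        call_rcu(&u->rcu, undo_like_free);      /* kfree runs after readers drop out */
}

The deferred free in the last line is what lets freeary() drop the undo structures immediately on IPC_RMID instead of leaving stale entries around for exit_sem() or the next semop, as described above.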
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Manfred Spraul Reviewed-by: Nadia Derbey Cc: Pierre Peiffer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sem.h | 6 ++- ipc/sem.c | 147 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 98 insertions(+), 55 deletions(-) (limited to 'ipc') diff --git a/include/linux/sem.h b/include/linux/sem.h index d42599395d79..1b191c176bcd 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -78,6 +78,7 @@ struct seminfo { #ifdef __KERNEL__ #include +#include struct task_struct; @@ -114,7 +115,10 @@ struct sem_queue { * when the process exits. */ struct sem_undo { - struct list_head list_proc; /* per-process list: all undos from one process */ + struct list_head list_proc; /* per-process list: all undos from one process. */ + /* rcu protected */ + struct rcu_head rcu; /* rcu struct for sem_undo() */ + struct sem_undo_list *ulp; /* sem_undo_list for the process */ struct list_head list_id; /* per semaphore array list: all undos for one array */ int semid; /* semaphore set identifier */ short * semadj; /* array of adjustments, one per semaphore */ diff --git a/ipc/sem.c b/ipc/sem.c index 3ca232736b31..bf1bc36cb7ee 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -504,27 +504,35 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) return semzcnt; } +void free_un(struct rcu_head *head) +{ + struct sem_undo *un = container_of(head, struct sem_undo, rcu); + kfree(un); +} + /* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex * remains locked on exit. */ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { - struct sem_undo *un; - struct sem_queue *q, *t; + struct sem_undo *un, *tu; + struct sem_queue *q, *tq; struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); - /* Invalidate the existing undo structures for this semaphore set. - * (They will be freed without any further action in exit_sem() - * or during the next semop.) - */ + /* Free the existing undo structures for this semaphore set. */ assert_spin_locked(&sma->sem_perm.lock); - list_for_each_entry(un, &sma->list_id, list_id) + list_for_each_entry_safe(un, tu, &sma->list_id, list_id) { + list_del(&un->list_id); + spin_lock(&un->ulp->lock); un->semid = -1; + list_del_rcu(&un->list_proc); + spin_unlock(&un->ulp->lock); + call_rcu(&un->rcu, free_un); + } /* Wake up all pending processes and let them fail with EIDRM. */ - - list_for_each_entry_safe(q, t, &sma->sem_pending, list) { + list_for_each_entry_safe(q, tq, &sma->sem_pending, list) { list_del(&q->list); q->status = IN_WAKEUP; @@ -948,16 +956,11 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp) static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) { - struct sem_undo *walk, *tmp; + struct sem_undo *walk; - assert_spin_locked(&ulp->lock); - list_for_each_entry_safe(walk, tmp, &ulp->list_proc, list_proc) { + list_for_each_entry_rcu(walk, &ulp->list_proc, list_proc) { if (walk->semid == semid) return walk; - if (walk->semid == -1) { - list_del(&walk->list_proc); - kfree(walk); - } } return NULL; } @@ -970,6 +973,8 @@ static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) * The function looks up (and if not present creates) the undo structure. * The size of the undo structure depends on the size of the semaphore * array, thus the alloc path is not that straightforward. 
+ * Lifetime-rules: sem_undo is rcu-protected, on success, the function + * performs a rcu_read_lock(). */ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) { @@ -983,11 +988,13 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) if (error) return ERR_PTR(error); + rcu_read_lock(); spin_lock(&ulp->lock); un = lookup_undo(ulp, semid); spin_unlock(&ulp->lock); if (likely(un!=NULL)) goto out; + rcu_read_unlock(); /* no undo structure around - allocate one. */ /* step 1: figure out the size of the semaphore array */ @@ -1005,38 +1012,38 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid) return ERR_PTR(-ENOMEM); } - /* step 3: Acquire the lock on the undo list pointer */ - spin_lock(&ulp->lock); - - /* step 4: check for races: someone else allocated the undo struct, - * semaphore array was destroyed. - */ - un = lookup_undo(ulp, semid); - if (un) { - spin_unlock(&ulp->lock); - kfree(new); - sem_putref(sma); - goto out; - } + /* step 3: Acquire the lock on semaphore array */ sem_lock_and_putref(sma); if (sma->sem_perm.deleted) { sem_unlock(sma); - spin_unlock(&ulp->lock); kfree(new); un = ERR_PTR(-EIDRM); goto out; } + spin_lock(&ulp->lock); + + /* + * step 4: check for races: did someone else allocate the undo struct? + */ + un = lookup_undo(ulp, semid); + if (un) { + kfree(new); + goto success; + } /* step 5: initialize & link new undo structure */ new->semadj = (short *) &new[1]; + new->ulp = ulp; new->semid = semid; assert_spin_locked(&ulp->lock); - list_add(&new->list_proc, &ulp->list_proc); + list_add_rcu(&new->list_proc, &ulp->list_proc); assert_spin_locked(&sma->sem_perm.lock); list_add(&new->list_id, &sma->list_id); + un = new; - sem_unlock(sma); +success: spin_unlock(&ulp->lock); - un = new; + rcu_read_lock(); + sem_unlock(sma); out: return un; } @@ -1103,6 +1110,8 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, sma = sem_lock_check(ns, semid); if (IS_ERR(sma)) { + if (un) + rcu_read_unlock(); error = PTR_ERR(sma); goto out_free; } @@ -1111,10 +1120,26 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, * semid identifiers are not unique - find_alloc_undo may have * allocated an undo structure, it was invalidated by an RMID * and now a new array with received the same id. Check and fail. + * This case can be detected checking un->semid. The existance of + * "un" itself is guaranteed by rcu. */ error = -EIDRM; - if (un && un->semid == -1) - goto out_unlock_free; + if (un) { + if (un->semid == -1) { + rcu_read_unlock(); + goto out_unlock_free; + } else { + /* + * rcu lock can be released, "un" cannot disappear: + * - sem_lock is acquired, thus IPC_RMID is + * impossible. + * - exit_sem is impossible, it always operates on + * current (or a dead task). 
+ */ + + rcu_read_unlock(); + } + } error = -EFBIG; if (max >= sma->sem_nsems) @@ -1242,7 +1267,6 @@ int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) void exit_sem(struct task_struct *tsk) { struct sem_undo_list *ulp; - struct sem_undo *un, *tmp; ulp = tsk->sysvsem.undo_list; if (!ulp) @@ -1252,28 +1276,47 @@ void exit_sem(struct task_struct *tsk) if (!atomic_dec_and_test(&ulp->refcnt)) return; - spin_lock(&ulp->lock); - - list_for_each_entry_safe(un, tmp, &ulp->list_proc, list_proc) { + for (;;) { struct sem_array *sma; + struct sem_undo *un; + int semid; int i; - if (un->semid == -1) - goto free; + rcu_read_lock(); + un = list_entry(rcu_dereference(ulp->list_proc.next), + struct sem_undo, list_proc); + if (&un->list_proc == &ulp->list_proc) + semid = -1; + else + semid = un->semid; + rcu_read_unlock(); - sma = sem_lock(tsk->nsproxy->ipc_ns, un->semid); - if (IS_ERR(sma)) - goto free; + if (semid == -1) + break; - if (un->semid == -1) - goto unlock_free; + sma = sem_lock_check(tsk->nsproxy->ipc_ns, un->semid); - BUG_ON(sem_checkid(sma, un->semid)); + /* exit_sem raced with IPC_RMID, nothing to do */ + if (IS_ERR(sma)) + continue; - /* remove un from sma->list_id */ + un = lookup_undo(ulp, semid); + if (un == NULL) { + /* exit_sem raced with IPC_RMID+semget() that created + * exactly the same semid. Nothing to do. + */ + sem_unlock(sma); + continue; + } + + /* remove un from the linked lists */ assert_spin_locked(&sma->sem_perm.lock); list_del(&un->list_id); + spin_lock(&ulp->lock); + list_del_rcu(&un->list_proc); + spin_unlock(&ulp->lock); + /* perform adjustments registered in un */ for (i = 0; i < sma->sem_nsems; i++) { struct sem * semaphore = &sma->sem_base[i]; @@ -1302,14 +1345,10 @@ void exit_sem(struct task_struct *tsk) sma->sem_otime = get_seconds(); /* maybe some queued-up processes were waiting for this */ update_queue(sma); -unlock_free: sem_unlock(sma); -free: - assert_spin_locked(&ulp->lock); - list_del(&un->list_proc); - kfree(un); + + call_rcu(&un->rcu, free_un); } - spin_unlock(&ulp->lock); kfree(ulp); } -- cgit v1.2.3 From f1a43f93f0f3bab418800eaccb9e2e3b5427e173 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 25 Jul 2008 01:48:07 -0700 Subject: ipc: use simple_read_from_buffer() Also this patch kills unneccesary trailing NULL character. 
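simple_read_from_buffer() does the offset clamping, length limiting and copy_to_user() that mqueue_read_file() used to open-code. A minimal sketch of the pattern for a hypothetical read handler (names are illustrative; the actual conversion follows in the diff below):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>

/* Hypothetical handler: expose a kernel-formatted string via ->read. */
static ssize_t demo_read(struct file *filp, char __user *ubuf,
                         size_t count, loff_t *ppos)
{
        char kbuf[64];
        int len;

        len = scnprintf(kbuf, sizeof(kbuf), "example: %d\n", 42);

        /*
         * Handles *ppos past EOF, short reads and the user copy for us;
         * returns the number of bytes copied, 0 at EOF, or -EFAULT.
         */
        return simple_read_from_buffer(ubuf, count, ppos, kbuf, len);
}

Passing the string length rather than length + 1 as the last argument is what drops the trailing NUL byte mentioned above.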
Signed-off-by: Akinobu Mita Cc: Nadia Derbey Cc: Manfred Spraul Cc: Pierre Peiffer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'ipc') diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 3e84b958186b..1fdc2eb2f6d8 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -314,15 +314,11 @@ static int mqueue_unlink(struct inode *dir, struct dentry *dentry) * through std routines) */ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, - size_t count, loff_t * off) + size_t count, loff_t *off) { struct mqueue_inode_info *info = MQUEUE_I(filp->f_path.dentry->d_inode); char buffer[FILENT_SIZE]; - size_t slen; - loff_t o; - - if (!count) - return 0; + ssize_t ret; spin_lock(&info->lock); snprintf(buffer, sizeof(buffer), @@ -335,21 +331,14 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, pid_vnr(info->notify_owner)); spin_unlock(&info->lock); buffer[sizeof(buffer)-1] = '\0'; - slen = strlen(buffer)+1; - - o = *off; - if (o > slen) - return 0; - - if (o + count > slen) - count = slen - o; - if (copy_to_user(u_data, buffer + o, count)) - return -EFAULT; + ret = simple_read_from_buffer(u_data, count, off, buffer, + strlen(buffer)); + if (ret <= 0) + return ret; - *off = o + count; filp->f_path.dentry->d_inode->i_atime = filp->f_path.dentry->d_inode->i_ctime = CURRENT_TIME; - return count; + return ret; } static int mqueue_flush_file(struct file *filp, fl_owner_t id) -- cgit v1.2.3 From 9eefe520c814f6f62c5d36a2ddcd3fb99dfdb30e Mon Sep 17 00:00:00 2001 From: Nadia Derbey Date: Fri, 25 Jul 2008 01:48:08 -0700 Subject: ipc: do not use a negative value to re-enable msgmni automatic recomputing This patch proposes an alternative to the "magical positive-versus-negative number trick" Andrew complained about last week in http://lkml.org/lkml/2008/6/24/418. This had been introduced with the patches that scale msgmni to the amount of lowmem. With these patches, msgmni has a registered notification routine that recomputes msgmni value upon memory add/remove or ipc namespace creation/ removal. When msgmni is changed from user space (i.e. value written to the proc file), that notification routine is unregistered, and the way to make it registered back is to write a negative value into the proc file. This is the "magical positive-versus-negative number trick". To fix this, a new proc file is introduced: /proc/sys/kernel/auto_msgmni. This file acts as ON/OFF for msgmni automatic recomputing. With this patch, the process is the following: 1) kernel boots in "automatic recomputing mode" /proc/sys/kernel/msgmni contains the value that has been computed (depends on lowmem) /proc/sys/kernel/automatic_msgmni contains "1" 2) echo > /proc/sys/kernel/msgmni . sets msg_ctlmni to . de-activates automatic recomputing (i.e. if, say, some memory is added msgmni won't be recomputed anymore) . /proc/sys/kernel/automatic_msgmni now contains "0" 3) echo "0" > /proc/sys/kernel/automatic_msgmni . de-activates msgmni automatic recomputing this has the same effect as 2) except that msg_ctlmni's value stays blocked at its current value) 3) echo "1" > /proc/sys/kernel/automatic_msgmni . recomputes msgmni's value based on the current available memory size and number of ipc namespaces . re-activates automatic recomputing for msgmni. 
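From user space, the workflow described above comes down to reading and writing two proc files. A small illustrative program, assuming a kernel with this patch applied (writing auto_msgmni requires root):

#include <stdio.h>

/* Print msgmni and the recomputing switch, then turn automatic mode back on. */
int main(void)
{
        char val[32];
        FILE *f;

        f = fopen("/proc/sys/kernel/msgmni", "r");
        if (f && fgets(val, sizeof(val), f))
                printf("msgmni      : %s", val);
        if (f)
                fclose(f);

        f = fopen("/proc/sys/kernel/auto_msgmni", "r");
        if (f && fgets(val, sizeof(val), f))
                printf("auto_msgmni : %s", val);
        if (f)
                fclose(f);

        /* Equivalent of: echo 1 > /proc/sys/kernel/auto_msgmni */
        f = fopen("/proc/sys/kernel/auto_msgmni", "w");
        if (f) {
                fputs("1\n", f);
                fclose(f);
        }
        return 0;
}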
Signed-off-by: Nadia Derbey Cc: Solofo Ramangalahy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 3 +- ipc/ipc_sysctl.c | 72 +++++++++++++++++++++++++++++++++++-------- ipc/ipcns_notifier.c | 20 +++++++++--- 3 files changed, 76 insertions(+), 19 deletions(-) (limited to 'ipc') diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index ea6c18a8b0d4..ea330f9e7100 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -36,6 +36,7 @@ struct ipc_namespace { int msg_ctlmni; atomic_t msg_bytes; atomic_t msg_hdrs; + int auto_msgmni; size_t shm_ctlmax; size_t shm_ctlall; @@ -53,7 +54,7 @@ extern atomic_t nr_ipc_ns; extern int register_ipcns_notifier(struct ipc_namespace *); extern int cond_register_ipcns_notifier(struct ipc_namespace *); -extern int unregister_ipcns_notifier(struct ipc_namespace *); +extern void unregister_ipcns_notifier(struct ipc_namespace *); extern int ipcns_notify(unsigned long); #else /* CONFIG_SYSVIPC */ diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index d3497465cc0a..69bc85978ba0 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -27,15 +27,17 @@ static void *get_ipc(ctl_table *table) } /* - * Routine that is called when a tunable has successfully been changed by - * hand and it has a callback routine registered on the ipc namespace notifier - * chain: we don't want such tunables to be recomputed anymore upon memory - * add/remove or ipc namespace creation/removal. - * They can come back to a recomputable state by being set to a <0 value. + * Routine that is called when the file "auto_msgmni" has successfully been + * written. + * Two values are allowed: + * 0: unregister msgmni's callback routine from the ipc namespace notifier + * chain. This means that msgmni won't be recomputed anymore upon memory + * add/remove or ipc namespace creation/removal. + * 1: register back the callback routine. */ -static void tunable_set_callback(int val) +static void ipc_auto_callback(int val) { - if (val >= 0) + if (!val) unregister_ipcns_notifier(current->nsproxy->ipc_ns); else { /* @@ -71,7 +73,12 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write, rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos); if (write && !rc && lenp_bef == *lenp) - tunable_set_callback(*((int *)(ipc_table.data))); + /* + * Tunable has successfully been changed by hand. Disable its + * automatic adjustment. This simply requires unregistering + * the notifiers that trigger recalculation. + */ + unregister_ipcns_notifier(current->nsproxy->ipc_ns); return rc; } @@ -87,10 +94,39 @@ static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, lenp, ppos); } +static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table ipc_table; + size_t lenp_bef = *lenp; + int oldval; + int rc; + + memcpy(&ipc_table, table, sizeof(ipc_table)); + ipc_table.data = get_ipc(table); + oldval = *((int *)(ipc_table.data)); + + rc = proc_dointvec_minmax(&ipc_table, write, filp, buffer, lenp, ppos); + + if (write && !rc && lenp_bef == *lenp) { + int newval = *((int *)(ipc_table.data)); + /* + * The file "auto_msgmni" has correctly been set. + * React by (un)registering the corresponding tunable, if the + * value has changed. 
+ */ + if (newval != oldval) + ipc_auto_callback(newval); + } + + return rc; +} + #else #define proc_ipc_doulongvec_minmax NULL #define proc_ipc_dointvec NULL #define proc_ipc_callback_dointvec NULL +#define proc_ipcauto_dointvec_minmax NULL #endif #ifdef CONFIG_SYSCTL_SYSCALL @@ -142,14 +178,11 @@ static int sysctl_ipc_registered_data(ctl_table *table, int __user *name, rc = sysctl_ipc_data(table, name, nlen, oldval, oldlenp, newval, newlen); - if (newval && newlen && rc > 0) { + if (newval && newlen && rc > 0) /* * Tunable has successfully been changed from userland */ - int *data = get_ipc(table); - - tunable_set_callback(*data); - } + unregister_ipcns_notifier(current->nsproxy->ipc_ns); return rc; } @@ -158,6 +191,9 @@ static int sysctl_ipc_registered_data(ctl_table *table, int __user *name, #define sysctl_ipc_registered_data NULL #endif +static int zero; +static int one = 1; + static struct ctl_table ipc_kern_table[] = { { .ctl_name = KERN_SHMMAX, @@ -222,6 +258,16 @@ static struct ctl_table ipc_kern_table[] = { .proc_handler = proc_ipc_dointvec, .strategy = sysctl_ipc_data, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "auto_msgmni", + .data = &init_ipc_ns.auto_msgmni, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_ipcauto_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, {} }; diff --git a/ipc/ipcns_notifier.c b/ipc/ipcns_notifier.c index 70ff09183f7b..b9b31a4f77e1 100644 --- a/ipc/ipcns_notifier.c +++ b/ipc/ipcns_notifier.c @@ -55,25 +55,35 @@ static int ipcns_callback(struct notifier_block *self, int register_ipcns_notifier(struct ipc_namespace *ns) { + int rc; + memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); ns->ipcns_nb.notifier_call = ipcns_callback; ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; - return blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb); + rc = blocking_notifier_chain_register(&ipcns_chain, &ns->ipcns_nb); + if (!rc) + ns->auto_msgmni = 1; + return rc; } int cond_register_ipcns_notifier(struct ipc_namespace *ns) { + int rc; + memset(&ns->ipcns_nb, 0, sizeof(ns->ipcns_nb)); ns->ipcns_nb.notifier_call = ipcns_callback; ns->ipcns_nb.priority = IPCNS_CALLBACK_PRI; - return blocking_notifier_chain_cond_register(&ipcns_chain, + rc = blocking_notifier_chain_cond_register(&ipcns_chain, &ns->ipcns_nb); + if (!rc) + ns->auto_msgmni = 1; + return rc; } -int unregister_ipcns_notifier(struct ipc_namespace *ns) +void unregister_ipcns_notifier(struct ipc_namespace *ns) { - return blocking_notifier_chain_unregister(&ipcns_chain, - &ns->ipcns_nb); + blocking_notifier_chain_unregister(&ipcns_chain, &ns->ipcns_nb); + ns->auto_msgmni = 0; } int ipcns_notify(unsigned long val) -- cgit v1.2.3 From 51cc50685a4275c6a02653670af9f108a64e01cf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 25 Jul 2008 19:45:34 -0700 Subject: SL*B: drop kmem cache argument from constructor Kmem cache passed to constructor is only needed for constructors that are themselves multiplexeres. Nobody uses this "feature", nor does anybody uses passed kmem cache in non-trivial way, so pass only pointer to object. Non-trivial places are: arch/powerpc/mm/init_64.c arch/powerpc/mm/hugetlbpage.c This is flag day, yes. 
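After this change a slab constructor receives only the object pointer, so any cache-specific behaviour has to be baked into the function itself. A minimal sketch with a hypothetical cache (the in-tree constructors are converted the same way in the diff below):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

struct demo_obj {
        int refcount;
        void *payload;
};

/* New-style constructor: no struct kmem_cache * argument anymore. */
static void demo_obj_ctor(void *obj)
{
        struct demo_obj *d = obj;

        d->refcount = 0;
        d->payload = NULL;
}

static struct kmem_cache *demo_cache;

static int __init demo_cache_init(void)
{
        demo_cache = kmem_cache_create("demo_obj", sizeof(struct demo_obj),
                                       0, SLAB_HWCACHE_ALIGN, demo_obj_ctor);
        return demo_cache ? 0 : -ENOMEM;
}

Constructors that previously relied on kmem_cache_size() now have to know the size some other way, which is why the powerpc zero_ctor() below is replaced by per-size pgd_ctor()/pmd_ctor() helpers.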
Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Acked-by: Christoph Lameter Cc: Jon Tollefson Cc: Nick Piggin Cc: Matt Mackall [akpm@linux-foundation.org: fix arch/powerpc/mm/hugetlbpage.c] [akpm@linux-foundation.org: fix mm/slab.c] [akpm@linux-foundation.org: fix ubifs] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/plat-s3c24xx/dma.c | 2 +- arch/powerpc/kernel/rtas_flash.c | 2 +- arch/powerpc/mm/hugetlbpage.c | 9 ++------- arch/powerpc/mm/init_64.c | 24 +++++++++--------------- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- arch/sh/mm/pmb.c | 2 +- arch/xtensa/mm/init.c | 2 +- drivers/usb/mon/mon_text.c | 4 ++-- fs/adfs/super.c | 2 +- fs/affs/super.c | 2 +- fs/afs/super.c | 4 ++-- fs/befs/linuxvfs.c | 2 +- fs/bfs/inode.c | 2 +- fs/block_dev.c | 2 +- fs/buffer.c | 2 +- fs/cifs/cifsfs.c | 2 +- fs/coda/inode.c | 2 +- fs/ecryptfs/main.c | 4 ++-- fs/efs/super.c | 2 +- fs/ext2/super.c | 2 +- fs/ext3/super.c | 2 +- fs/ext4/super.c | 2 +- fs/fat/cache.c | 2 +- fs/fat/inode.c | 2 +- fs/fuse/inode.c | 2 +- fs/gfs2/main.c | 4 ++-- fs/hfs/super.c | 2 +- fs/hfsplus/super.c | 2 +- fs/hpfs/super.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/inode.c | 2 +- fs/isofs/inode.c | 2 +- fs/jffs2/super.c | 2 +- fs/jfs/jfs_metapage.c | 2 +- fs/jfs/super.c | 2 +- fs/locks.c | 2 +- fs/minix/inode.c | 2 +- fs/ncpfs/inode.c | 2 +- fs/nfs/inode.c | 2 +- fs/ntfs/super.c | 2 +- fs/ocfs2/dlm/dlmfs.c | 3 +-- fs/ocfs2/super.c | 2 +- fs/openpromfs/inode.c | 2 +- fs/proc/inode.c | 2 +- fs/qnx4/inode.c | 2 +- fs/reiserfs/super.c | 2 +- fs/romfs/inode.c | 2 +- fs/smbfs/inode.c | 2 +- fs/sysv/inode.c | 2 +- fs/ubifs/super.c | 2 +- fs/udf/super.c | 2 +- fs/ufs/super.c | 2 +- fs/xfs/linux-2.6/kmem.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 1 - include/linux/slab.h | 2 +- include/linux/slub_def.h | 2 +- ipc/mqueue.c | 2 +- kernel/fork.c | 2 +- lib/idr.c | 2 +- lib/radix-tree.c | 2 +- mm/rmap.c | 2 +- mm/shmem.c | 2 +- mm/slab.c | 11 +++++------ mm/slob.c | 7 +++---- mm/slub.c | 13 ++++++------- net/socket.c | 2 +- net/sunrpc/rpc_pipe.c | 2 +- 67 files changed, 90 insertions(+), 106 deletions(-) (limited to 'ipc') diff --git a/arch/arm/plat-s3c24xx/dma.c b/arch/arm/plat-s3c24xx/dma.c index 60f162dc4fad..8c5e656d5d8c 100644 --- a/arch/arm/plat-s3c24xx/dma.c +++ b/arch/arm/plat-s3c24xx/dma.c @@ -1304,7 +1304,7 @@ struct sysdev_class dma_sysclass = { /* kmem cache implementation */ -static void s3c2410_dma_cache_ctor(struct kmem_cache *c, void *p) +static void s3c2410_dma_cache_ctor(void *p) { memset(p, 0, sizeof(struct s3c2410_dma_buf)); } diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 09ded5c424a9..149cb112cd1a 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -286,7 +286,7 @@ static ssize_t rtas_flash_read(struct file *file, char __user *buf, } /* constructor for flash_block_cache */ -void rtas_block_ctor(struct kmem_cache *cache, void *ptr) +void rtas_block_ctor(void *ptr) { memset(ptr, 0, RTAS_BLK_SIZE); } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index fb42c4dd3217..ed0aab0208a6 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -113,7 +113,7 @@ static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, unsigned long address, unsigned int psize) { - pte_t *new = kmem_cache_alloc(huge_pgtable_cache(psize), + pte_t *new = kmem_cache_zalloc(huge_pgtable_cache(psize), GFP_KERNEL|__GFP_REPEAT); 
if (! new) @@ -730,11 +730,6 @@ static int __init hugepage_setup_sz(char *str) } __setup("hugepagesz=", hugepage_setup_sz); -static void zero_ctor(struct kmem_cache *cache, void *addr) -{ - memset(addr, 0, kmem_cache_size(cache)); -} - static int __init hugetlbpage_init(void) { unsigned int psize; @@ -756,7 +751,7 @@ static int __init hugetlbpage_init(void) HUGEPTE_TABLE_SIZE(psize), HUGEPTE_TABLE_SIZE(psize), 0, - zero_ctor); + NULL); if (!huge_pgtable_cache(psize)) panic("hugetlbpage_init(): could not create %s"\ "\n", HUGEPTE_CACHE_NAME(psize)); diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index a41bc5aa2043..4f7df85129d8 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -136,9 +136,14 @@ static int __init setup_kcore(void) module_init(setup_kcore); #endif -static void zero_ctor(struct kmem_cache *cache, void *addr) +static void pgd_ctor(void *addr) { - memset(addr, 0, kmem_cache_size(cache)); + memset(addr, 0, PGD_TABLE_SIZE); +} + +static void pmd_ctor(void *addr) +{ + memset(addr, 0, PMD_TABLE_SIZE); } static const unsigned int pgtable_cache_size[2] = { @@ -163,19 +168,8 @@ struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; void pgtable_cache_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(pgtable_cache_size); i++) { - int size = pgtable_cache_size[i]; - const char *name = pgtable_cache_name[i]; - - pr_debug("Allocating page table cache %s (#%d) " - "for size: %08x...\n", name, i, size); - pgtable_cache[i] = kmem_cache_create(name, - size, size, - SLAB_PANIC, - zero_ctor); - } + pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); + pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); } #ifdef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 7123472801d9..690ca7b0dcf6 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -78,7 +78,7 @@ spufs_destroy_inode(struct inode *inode) } static void -spufs_init_once(struct kmem_cache *cachep, void *p) +spufs_init_once(void *p) { struct spufs_inode_info *ei = p; diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c index 0b0ec6e04753..46911bcbf17b 100644 --- a/arch/sh/mm/pmb.c +++ b/arch/sh/mm/pmb.c @@ -293,7 +293,7 @@ void pmb_unmap(unsigned long addr) } while (pmbe); } -static void pmb_cache_ctor(struct kmem_cache *cachep, void *pmb) +static void pmb_cache_ctor(void *pmb) { struct pmb_entry *pmbe = pmb; diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 81d0560eaea2..ee261005b363 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -309,7 +309,7 @@ void show_mem(void) struct kmem_cache *pgtable_cache __read_mostly; -static void pgd_ctor(struct kmem_cache *cache, void* addr) +static void pgd_ctor(void* addr) { pte_t* ptep = (pte_t*)addr; int i; diff --git a/drivers/usb/mon/mon_text.c b/drivers/usb/mon/mon_text.c index 5e3e4e9b6c77..1f715436d6d3 100644 --- a/drivers/usb/mon/mon_text.c +++ b/drivers/usb/mon/mon_text.c @@ -87,7 +87,7 @@ struct mon_reader_text { static struct dentry *mon_dir; /* Usually /sys/kernel/debug/usbmon */ -static void mon_text_ctor(struct kmem_cache *, void *); +static void mon_text_ctor(void *); struct mon_text_ptr { int cnt, limit; @@ -720,7 +720,7 @@ void mon_text_del(struct mon_bus *mbus) /* * Slab interface: constructor. 
*/ -static void mon_text_ctor(struct kmem_cache *slab, void *mem) +static void mon_text_ctor(void *mem) { /* * Nothing to initialize. No, really! diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 9e421eeb672b..26f3b43726bb 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -249,7 +249,7 @@ static void adfs_destroy_inode(struct inode *inode) kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; diff --git a/fs/affs/super.c b/fs/affs/super.c index 4e0309566406..3a89094f93d0 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -90,7 +90,7 @@ static void affs_destroy_inode(struct inode *inode) kmem_cache_free(affs_inode_cachep, AFFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct affs_inode_info *ei = (struct affs_inode_info *) foo; diff --git a/fs/afs/super.c b/fs/afs/super.c index 7e3faeef6818..250d8c4d66e4 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -27,7 +27,7 @@ #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */ -static void afs_i_init_once(struct kmem_cache *cachep, void *foo); +static void afs_i_init_once(void *foo); static int afs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt); @@ -449,7 +449,7 @@ static void afs_put_super(struct super_block *sb) /* * initialise an inode cache slab element prior to any use */ -static void afs_i_init_once(struct kmem_cache *cachep, void *_vnode) +static void afs_i_init_once(void *_vnode) { struct afs_vnode *vnode = _vnode; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index e8717de3bab3..02c6e62b72f8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -289,7 +289,7 @@ befs_destroy_inode(struct inode *inode) kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct befs_inode_info *bi = (struct befs_inode_info *) foo; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 053e690ec9ed..0ed57b5ee012 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -264,7 +264,7 @@ static void bfs_destroy_inode(struct inode *inode) kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct bfs_inode_info *bi = foo; diff --git a/fs/block_dev.c b/fs/block_dev.c index 10d8a0aa871a..dcf37cada369 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -271,7 +271,7 @@ static void bdev_destroy_inode(struct inode *inode) kmem_cache_free(bdev_cachep, bdi); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; struct block_device *bdev = &ei->bdev; diff --git a/fs/buffer.c b/fs/buffer.c index 109b261192d9..5fd497cdd6f3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3272,7 +3272,7 @@ int bh_submit_read(struct buffer_head *bh) EXPORT_SYMBOL(bh_submit_read); static void -init_buffer_head(struct kmem_cache *cachep, void *data) +init_buffer_head(void *data) { struct buffer_head *bh = data; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 22857c639df5..fe5f6809cba6 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -766,7 +766,7 @@ const struct file_operations cifs_dir_ops = { }; static void -cifs_init_once(struct kmem_cache *cachep, void *inode) +cifs_init_once(void *inode) { struct 
cifsInodeInfo *cifsi = inode; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2f58dfc70083..830f51abb971 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode) kmem_cache_free(coda_inode_cachep, ITOC(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct coda_inode_info *ei = (struct coda_inode_info *) foo; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 6f403cfba14f..448dfd597b5f 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -578,7 +578,7 @@ static struct file_system_type ecryptfs_fs_type = { * Initializes the ecryptfs_inode_info_cache when it is created */ static void -inode_info_init_once(struct kmem_cache *cachep, void *vptr) +inode_info_init_once(void *vptr) { struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; @@ -589,7 +589,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; - void (*ctor)(struct kmem_cache *cache, void *obj); + void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { .cache = &ecryptfs_auth_tok_list_item_cache, diff --git a/fs/efs/super.c b/fs/efs/super.c index d733531b55e2..567b134fa1f1 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -70,7 +70,7 @@ static void efs_destroy_inode(struct inode *inode) kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 31308a3b0b8b..fd88c7b43e66 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -159,7 +159,7 @@ static void ext2_destroy_inode(struct inode *inode) kmem_cache_free(ext2_inode_cachep, EXT2_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ext2_inode_info *ei = (struct ext2_inode_info *) foo; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 615788c6843a..8ddced384674 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -472,7 +472,7 @@ static void ext3_destroy_inode(struct inode *inode) kmem_cache_free(ext3_inode_cachep, EXT3_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ext3_inode_info *ei = (struct ext3_inode_info *) foo; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1cb371dcd609..b5479b1dff14 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -595,7 +595,7 @@ static void ext4_destroy_inode(struct inode *inode) kmem_cache_free(ext4_inode_cachep, EXT4_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 3a9ecac8d61f..3222f51c41cf 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -36,7 +36,7 @@ static inline int fat_max_cache(struct inode *inode) static struct kmem_cache *fat_cache_cachep; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct fat_cache *cache = (struct fat_cache *)foo; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 23676f9d79ce..6d266d793e2c 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -498,7 +498,7 @@ static void fat_destroy_inode(struct inode *inode) kmem_cache_free(fat_inode_cachep, MSDOS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void 
*foo) { struct msdos_inode_info *ei = (struct msdos_inode_info *)foo; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7d2f7d6e22e2..d2249f174e20 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -956,7 +956,7 @@ static inline void unregister_fuseblk(void) } #endif -static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo) +static void fuse_inode_init_once(void *foo) { struct inode * inode = foo; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index bcc668d0fadd..bb2cc303ac29 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -24,7 +24,7 @@ #include "util.h" #include "glock.h" -static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) +static void gfs2_init_inode_once(void *foo) { struct gfs2_inode *ip = foo; @@ -33,7 +33,7 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo) ip->i_alloc = NULL; } -static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo) +static void gfs2_init_glock_once(void *foo) { struct gfs2_glock *gl = foo; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index ac2ec5ef66e4..4abb1047c689 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -432,7 +432,7 @@ static struct file_system_type hfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void hfs_init_once(struct kmem_cache *cachep, void *p) +static void hfs_init_once(void *p) { struct hfs_inode_info *i = p; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 3859118531c7..e834e578c93f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -485,7 +485,7 @@ static struct file_system_type hfsplus_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void hfsplus_init_once(struct kmem_cache *cachep, void *p) +static void hfsplus_init_once(void *p) { struct hfsplus_inode_info *i = p; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index f63a699ec659..b8ae9c90ada0 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -173,7 +173,7 @@ static void hpfs_destroy_inode(struct inode *inode) kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dbd01d262ca4..3f58923fb39b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -705,7 +705,7 @@ static const struct address_space_operations hugetlbfs_aops = { }; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; diff --git a/fs/inode.c b/fs/inode.c index 35b6414522ea..b6726f644530 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -224,7 +224,7 @@ void inode_init_once(struct inode *inode) EXPORT_SYMBOL(inode_init_once); -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct inode * inode = (struct inode *) foo; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 044a254d526b..26948a6033b6 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode) kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct iso_inode_info *ei = foo; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 7da69eae49e4..efd401257ed9 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -44,7 +44,7 @@ static void jffs2_destroy_inode(struct inode *inode) 
kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode)); } -static void jffs2_i_init_once(struct kmem_cache *cachep, void *foo) +static void jffs2_i_init_once(void *foo) { struct jffs2_inode_info *f = foo; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 854ff0ec574f..c350057087dd 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -182,7 +182,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) #endif -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct metapage *mp = (struct metapage *)foo; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 359c091d8965..3630718be395 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -760,7 +760,7 @@ static struct file_system_type jfs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; diff --git a/fs/locks.c b/fs/locks.c index 01490300f7cb..5eb259e3cd38 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -201,7 +201,7 @@ EXPORT_SYMBOL(locks_init_lock); * Initialises the fields of the file lock which are invariant for * free file_locks. */ -static void init_once(struct kmem_cache *cache, void *foo) +static void init_once(void *foo) { struct file_lock *lock = (struct file_lock *) foo; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 523d73713418..d1d1eb84679d 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -68,7 +68,7 @@ static void minix_destroy_inode(struct inode *inode) kmem_cache_free(minix_inode_cachep, minix_i(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct minix_inode_info *ei = (struct minix_inode_info *) foo; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 2e5ab1204dec..d642f0e5b365 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -64,7 +64,7 @@ static void ncp_destroy_inode(struct inode *inode) kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct ncp_inode_info *ei = (struct ncp_inode_info *) foo; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index df23f987da6b..52daefa2f521 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1242,7 +1242,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi) #endif } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct nfs_inode *nfsi = (struct nfs_inode *) foo; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 3e76f3b216bc..4a46743b5077 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3080,7 +3080,7 @@ struct kmem_cache *ntfs_inode_cache; struct kmem_cache *ntfs_big_inode_cache; /* Init once constructor for the inode slab cache. 
*/ -static void ntfs_big_inode_init_once(struct kmem_cache *cachep, void *foo) +static void ntfs_big_inode_init_once(void *foo) { ntfs_inode *ni = (ntfs_inode *)foo; diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index e48aba698b77..533a789c3ef8 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -267,8 +267,7 @@ static ssize_t dlmfs_file_write(struct file *filp, return writelen; } -static void dlmfs_init_once(struct kmem_cache *cachep, - void *foo) +static void dlmfs_init_once(void *foo) { struct dlmfs_inode_private *ip = (struct dlmfs_inode_private *) foo; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index ccecfe5094fa..2560b33889aa 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1118,7 +1118,7 @@ bail: return status; } -static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data) +static void ocfs2_inode_init_once(void *data) { struct ocfs2_inode_info *oi = data; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index d17b4fd204e1..9f5b054f06b9 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -430,7 +430,7 @@ static struct file_system_type openprom_fs_type = { .kill_sb = kill_anon_super, }; -static void op_inode_init_once(struct kmem_cache * cachep, void *data) +static void op_inode_init_once(void *data) { struct op_inode_info *oi = (struct op_inode_info *) data; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 02eca2ed9dd7..b37f25dc45a5 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -94,7 +94,7 @@ static void proc_destroy_inode(struct inode *inode) kmem_cache_free(proc_inode_cachep, PROC_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct proc_inode *ei = (struct proc_inode *) foo; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index b31ab78052b3..2aad1044b84c 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -553,7 +553,7 @@ static void qnx4_destroy_inode(struct inode *inode) kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 2ec748ba0bd3..879e54d35c2d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -521,7 +521,7 @@ static void reiserfs_destroy_inode(struct inode *inode) kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 3f13d491c7c7..8e51a2aaa977 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -577,7 +577,7 @@ static void romfs_destroy_inode(struct inode *inode) kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct romfs_inode_info *ei = foo; diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 376ef3ee6ed7..3528f40ffb0f 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -67,7 +67,7 @@ static void smb_destroy_inode(struct inode *inode) kmem_cache_free(smb_inode_cachep, SMB_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct smb_inode_info *ei = (struct smb_inode_info *) foo; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index c5d60de0658f..df0d435baa48 100644 --- a/fs/sysv/inode.c +++ 
b/fs/sysv/inode.c @@ -326,7 +326,7 @@ static void sysv_destroy_inode(struct inode *inode) kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *p) +static void init_once(void *p) { struct sysv_inode_info *si = (struct sysv_inode_info *)p; diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 00eb9c68ad03..ca1e2d4e03cc 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1841,7 +1841,7 @@ static struct file_system_type ubifs_fs_type = { /* * Inode slab cache constructor. */ -static void inode_slab_ctor(struct kmem_cache *cachep, void *obj) +static void inode_slab_ctor(void *obj) { struct ubifs_inode *ui = obj; inode_init_once(&ui->vfs_inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index 44cc702f96cc..5698bbf83bbf 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -148,7 +148,7 @@ static void udf_destroy_inode(struct inode *inode) kmem_cache_free(udf_inode_cachep, UDF_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct udf_inode_info *ei = (struct udf_inode_info *)foo; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 227c9d700040..3e30e40aa24d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1302,7 +1302,7 @@ static void ufs_destroy_inode(struct inode *inode) kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } -static void init_once(struct kmem_cache * cachep, void *foo) +static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h index 5e9564902976..a20683cf74dd 100644 --- a/fs/xfs/linux-2.6/kmem.h +++ b/fs/xfs/linux-2.6/kmem.h @@ -79,7 +79,7 @@ kmem_zone_init(int size, char *zone_name) static inline kmem_zone_t * kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, - void (*construct)(kmem_zone_t *, void *)) + void (*construct)(void *)) { return kmem_cache_create(zone_name, size, 0, flags, construct); } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 742b2c7852c1..943381284e2e 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -843,7 +843,6 @@ xfs_fs_destroy_inode( STATIC void xfs_fs_inode_init_once( - kmem_zone_t *zonep, void *vnode) { inode_init_once(vn_to_inode((bhv_vnode_t *)vnode)); diff --git a/include/linux/slab.h b/include/linux/slab.h index 41103910f8a2..9ff8e8499403 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -58,7 +58,7 @@ int slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, - void (*)(struct kmem_cache *, void *)); + void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d117ea2825a9..5bad61a93f65 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -85,7 +85,7 @@ struct kmem_cache { struct kmem_cache_order_objects min; gfp_t allocflags; /* gfp flags to use on each alloc */ int refcount; /* Refcount for slab cache destroy */ - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *); int inuse; /* Offset to metadata */ int align; /* Alignment */ const char *name; /* Name (only for display!) 
*/ diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 1fdc2eb2f6d8..474984f9e032 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -207,7 +207,7 @@ static int mqueue_get_sb(struct file_system_type *fs_type, return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo; diff --git a/kernel/fork.c b/kernel/fork.c index b99d73e971a4..80e83e459b17 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1442,7 +1442,7 @@ long do_fork(unsigned long clone_flags, #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -static void sighand_ctor(struct kmem_cache *cachep, void *data) +static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; diff --git a/lib/idr.c b/lib/idr.c index 3476f8203e97..e728c7fccc4d 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -607,7 +607,7 @@ void *idr_replace(struct idr *idp, void *ptr, int id) } EXPORT_SYMBOL(idr_replace); -static void idr_cache_ctor(struct kmem_cache *idr_layer_cache, void *idr_layer) +static void idr_cache_ctor(void *idr_layer) { memset(idr_layer, 0, sizeof(struct idr_layer)); } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9c4f1ffa2864..be86b32bc874 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1183,7 +1183,7 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) EXPORT_SYMBOL(radix_tree_tagged); static void -radix_tree_node_ctor(struct kmem_cache *cachep, void *node) +radix_tree_node_ctor(void *node) { memset(node, 0, sizeof(struct radix_tree_node)); } diff --git a/mm/rmap.c b/mm/rmap.c index abbd29f7c43f..39ae5a9bf382 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -138,7 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(struct kmem_cache *cachep, void *data) +static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; diff --git a/mm/shmem.c b/mm/shmem.c index 1089092aecaf..952d361774bb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2352,7 +2352,7 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; diff --git a/mm/slab.c b/mm/slab.c index 052e7d64537e..918f04f7fef1 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -406,7 +406,7 @@ struct kmem_cache { unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *obj); /* 5) cache creation/removal */ const char *name; @@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep, * They must also be threaded. 
*/ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(cachep, objp + obj_offset(cachep)); + cachep->ctor(objp + obj_offset(cachep)); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep, cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #endif slab_bufctl(slabp)[i] = i + 1; } @@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", diff --git a/mm/slob.c b/mm/slob.c index de268eb7ac70..d8fbd4d1bfa7 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -525,12 +525,11 @@ struct kmem_cache { unsigned int size, align; unsigned long flags; const char *name; - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *); }; struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *c; @@ -575,7 +574,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) b = slob_new_page(flags, get_order(c->size), node); if (c->ctor) - c->ctor(c, b); + c->ctor(b); return b; } diff --git a/mm/slub.c b/mm/slub.c index 77c21cf53ff9..b7e2cd5d82db 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1012,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug); static unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. 
@@ -1040,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { return flags; } @@ -1103,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(s, object); + s->ctor(object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -2286,7 +2286,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { memset(s, 0, kmem_size); s->name = name; @@ -3042,7 +3042,7 @@ static int slab_unmergeable(struct kmem_cache *s) static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { struct kmem_cache *s; @@ -3082,8 +3082,7 @@ static struct kmem_cache *find_mergeable(size_t size, } struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; diff --git a/net/socket.c b/net/socket.c index 1310a82cbba7..8ef8ba81b9e2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -265,7 +265,7 @@ static void sock_destroy_inode(struct inode *inode) container_of(inode, struct socket_alloc, vfs_inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct socket_alloc *ei = (struct socket_alloc *)foo; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 5a9b0e7828cd..23a2b8f6dc49 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -897,7 +897,7 @@ static struct file_system_type rpc_pipe_fs_type = { }; static void -init_once(struct kmem_cache * cachep, void *foo) +init_once(void *foo) { struct rpc_inode *rpci = (struct rpc_inode *) foo; -- cgit v1.2.3 From f419a2e3b64def707e1384ee38abb77f99af5f6d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 22 Jul 2008 00:07:17 -0400 Subject: [PATCH] kill nameidata passing to permission(), rename to inode_permission() Incidentally, the name that gives hundreds of false positives on grep is not a good idea... 
Signed-off-by: Al Viro --- fs/ecryptfs/inode.c | 2 +- fs/namei.c | 22 +++++++++------------- fs/nfsd/nfsfh.c | 2 +- fs/nfsd/vfs.c | 4 ++-- fs/utimes.c | 2 +- fs/xattr.c | 2 +- include/linux/fs.h | 2 +- ipc/mqueue.c | 2 +- 8 files changed, 17 insertions(+), 21 deletions(-) (limited to 'ipc') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index f25caf2b0887..89209f00f9c7 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -830,7 +830,7 @@ out: static int ecryptfs_permission(struct inode *inode, int mask) { - return permission(ecryptfs_inode_to_lower(inode), mask, NULL); + return inode_permission(ecryptfs_inode_to_lower(inode), mask); } /** diff --git a/fs/namei.c b/fs/namei.c index 396cb3e5c364..5029b93ebbd5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -227,13 +227,9 @@ int generic_permission(struct inode *inode, int mask, return -EACCES; } -int permission(struct inode *inode, int mask, struct nameidata *nd) +int inode_permission(struct inode *inode, int mask) { int retval; - struct vfsmount *mnt = NULL; - - if (nd) - mnt = nd->path.mnt; if (mask & MAY_WRITE) { umode_t mode = inode->i_mode; @@ -293,7 +289,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) */ int vfs_permission(struct nameidata *nd, int mask) { - return permission(nd->path.dentry->d_inode, mask, nd); + return inode_permission(nd->path.dentry->d_inode, mask); } /** @@ -310,7 +306,7 @@ int vfs_permission(struct nameidata *nd, int mask) */ int file_permission(struct file *file, int mask) { - return permission(file->f_path.dentry->d_inode, mask, NULL); + return inode_permission(file->f_path.dentry->d_inode, mask); } /* @@ -1262,7 +1258,7 @@ static struct dentry *lookup_hash(struct nameidata *nd) { int err; - err = permission(nd->path.dentry->d_inode, MAY_EXEC, nd); + err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); return __lookup_hash(&nd->last, nd->path.dentry, nd); @@ -1310,7 +1306,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) if (err) return ERR_PTR(err); - err = permission(base->d_inode, MAY_EXEC, NULL); + err = inode_permission(base->d_inode, MAY_EXEC); if (err) return ERR_PTR(err); return __lookup_hash(&this, base, NULL); @@ -1400,7 +1396,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir) BUG_ON(victim->d_parent->d_inode != dir); audit_inode_child(victim->d_name.name, victim, dir); - error = permission(dir,MAY_WRITE | MAY_EXEC, NULL); + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) @@ -1437,7 +1433,7 @@ static inline int may_create(struct inode *dir, struct dentry *child, return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return permission(dir,MAY_WRITE | MAY_EXEC, nd); + return inode_permission(dir, MAY_WRITE | MAY_EXEC); } /* @@ -2543,7 +2539,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, * we'll need to flip '..'. 
*/ if (new_dir != old_dir) { - error = permission(old_dentry->d_inode, MAY_WRITE, NULL); + error = inode_permission(old_dentry->d_inode, MAY_WRITE); if (error) return error; } @@ -2897,7 +2893,7 @@ EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(permission); +EXPORT_SYMBOL(inode_permission); EXPORT_SYMBOL(vfs_permission); EXPORT_SYMBOL(file_permission); EXPORT_SYMBOL(unlock_rename); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f45451eb1e38..ea37c96f0445 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -51,7 +51,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = permission(parent->d_inode, MAY_EXEC, NULL); + err = inode_permission(parent->d_inode, MAY_EXEC); if (err < 0) { dput(parent); break; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index ad1ad59e3742..18060bed5267 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1953,12 +1953,12 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ - err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL); + err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) - err = permission(inode, MAY_EXEC, NULL); + err = inode_permission(inode, MAY_EXEC); return err? nfserrno(err) : 0; } diff --git a/fs/utimes.c b/fs/utimes.c index dad679d3a158..dc28b7826259 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -96,7 +96,7 @@ static int utimes_common(struct path *path, struct timespec *times) goto mnt_drop_write_and_out; if (!is_owner_or_cap(inode)) { - error = permission(inode, MAY_WRITE, NULL); + error = inode_permission(inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; } diff --git a/fs/xattr.c b/fs/xattr.c index 4706a8b1f495..b96222e05ba0 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -63,7 +63,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) return -EPERM; } - return permission(inode, mask, NULL); + return inode_permission(inode, mask); } int diff --git a/include/linux/fs.h b/include/linux/fs.h index 527b9e482f99..9d2de4cadabd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1770,7 +1770,7 @@ extern int do_remount_sb(struct super_block *sb, int flags, extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); -extern int permission(struct inode *, int, struct nameidata *); +extern int inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int, int (*check_acl)(struct inode *, int)); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 474984f9e032..96fb36cd9874 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -638,7 +638,7 @@ static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE, return ERR_PTR(-EINVAL); } - if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL)) { + if (inode_permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE])) { dput(dentry); mntput(mqueue_mnt); return ERR_PTR(-EACCES); -- cgit v1.2.3
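The change above reduces permission checking to the inode and the MAY_* mask: inode_permission(inode, mask) replaces permission(inode, mask, nd), and the nameidata argument disappears from every caller. Below is a minimal sketch of what a stacking filesystem's ->permission hook looks like under the new prototype, mirroring the fs/ecryptfs/inode.c hunk; "examplefs" and examplefs_lower_inode() are hypothetical names used only for illustration, not part of the patch.

	/* Hypothetical stacking filesystem: delegate the access check to the
	 * lower inode.  Only the inode and the MAY_* mask are passed now;
	 * there is no nameidata argument any more. */
	static int examplefs_permission(struct inode *inode, int mask)
	{
		return inode_permission(examplefs_lower_inode(inode), mask);
	}

A caller that previously passed a trailing NULL nameidata, as the fs/nfsd/vfs.c hunk shows, now simply drops it and passes only the mask.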
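The constructor-prototype change that makes up the bulk of this section follows the same pattern: slab constructors lose their struct kmem_cache * argument, and kmem_cache_create() now takes a void (*ctor)(void *). The sketch below shows roughly how a filesystem registers an inode cache under the post-patch include/linux/slab.h prototype; "myfs", struct myfs_inode_info and myfs_init_inodecache() are hypothetical names introduced only to illustrate the new signature.

	#include <linux/fs.h>
	#include <linux/init.h>
	#include <linux/slab.h>

	struct myfs_inode_info {		/* hypothetical per-inode state */
		unsigned long flags;
		struct inode vfs_inode;
	};

	static struct kmem_cache *myfs_inode_cachep;

	/* Constructor now receives only the object being initialised. */
	static void myfs_init_once(void *foo)
	{
		struct myfs_inode_info *ei = foo;

		inode_init_once(&ei->vfs_inode);
	}

	static int __init myfs_init_inodecache(void)
	{
		myfs_inode_cachep = kmem_cache_create("myfs_inode_cache",
					sizeof(struct myfs_inode_info), 0,
					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
					myfs_init_once);
		return myfs_inode_cachep ? 0 : -ENOMEM;
	}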